annotate src/fftw-3.3.5/rdft/scalar/r2cf/hc2cfdft_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:48:50 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cfdft_20 -include hc2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 286 FP additions, 188 FP multiplications,
Chris@42 32 * (or, 176 additions, 78 multiplications, 110 fused multiply/add),
Chris@42 33 * 174 stack variables, 5 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cf.h"
Chris@42 36
/*
 * hc2cfdft_20 -- machine-generated codelet, FMA variant (this copy is compiled
 * only when HAVE_FMA is defined). The generator was run with "-n 20 -dit".
 *
 * Rp, Ip, Rm, Im  data arrays; in every loop iteration each one is read at
 *                 index 0 and WS(rs, 1) .. WS(rs, 9) and then overwritten at
 *                 those same ten positions (the transform is in place).
 * W               read-only table of multipliers; W[0] .. W[37] are consumed
 *                 per iteration (presumably 19 complex twiddle factors --
 *                 confirm against the twiddle layout in the FFTW headers).
 * rs              stride handed to WS() to address elements in one iteration.
 * mb, me          the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms              per-iteration pointer step: Rp and Ip move forward by ms,
 *                 Rm and Im move backward by ms.
 *
 * FMA() and FNMS() are macros supplied by the included headers (presumably
 * a * b + c and c - a * b respectively -- confirm there).
 */
Chris@42 37 static void hc2cfdft_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/*
 * Numeric constants used by the output stage. For reference:
 * 0.951056516... = sin(2*pi/5), 0.559016994... = sqrt(5)/4,
 * 0.618033988... = (sqrt(5) - 1)/2.
 */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 43 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 44 {
Chris@42 45 INT m;
/*
 * W is first moved to W + (mb - 1) * 38 and then advanced by 38 values after
 * every iteration. MAKE_VOLATILE_STRIDE is a macro defined outside this file.
 */
Chris@42 46 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@42 47 E T4X, T5i, T5k, T5e, T5c, T5d, T5j, T5f;
Chris@42 48 {
Chris@42 49 E T2E, T4W, T3v, T4k, T2M, T3w, T4V, T4j, T2p, T2T, T5a, T5A, T3D, T3o, T4b;
Chris@42 50 E T4B, T1Y, T2S, T5z, T57, T3h, T3C, T4A, T44, TH, T2P, T50, T5x, T3z, T32;
Chris@42 51 E T3P, T4D, T3V, T3U, T5w, T53, T2Q, T1o, T3A, T39;
/*
 * Load stage: every element of Rp/Ip/Rm/Im and every W value is read in this
 * block, combined into sums/differences and multiplied by the W entries.
 * No store to the data arrays happens until this block has finished.
 */
Chris@42 52 {
Chris@42 53 E T1V, T9, T2w, Tu, T1, T6, T1R, T1U, T1T, T2Y, T5, T40, T2F, T10, T2C;
Chris@42 54 E TE, TX, T2m, T1y, T4g, TS, T33, TW, Tw, TB, T2y, T2B, TA, T3L, T2A;
Chris@42 55 E T3t, T1q, T1v, T2i, T2l, T2k, T3d, T1u, T48, Tm, Tr, T2s, T2v, T2u, T3J;
Chris@42 56 E Tq, T3r, T20, T1g, T23, T1l, T1h, T3S, T3k, T21, T2H, TL, T2K, TQ, TM;
Chris@42 57 E T35, T4h, T2I, T2f, T2g, T1I, T1D, T2c, T46, T2e, T3b, T1E, T28, T16, T29;
Chris@42 58 E T1b, T25, T3i, T27, T3Q, T17, T1O, T1P, Tj, T1M, Te, T1L, Tb, T3Y, TV;
Chris@42 59 E T1d, T1Z;
Chris@42 60 {
Chris@42 61 E T1S, T4, T7, T8;
Chris@42 62 T7 = Rp[WS(rs, 9)];
Chris@42 63 T8 = Rm[WS(rs, 9)];
Chris@42 64 {
Chris@42 65 E Ts, Tt, T2, T3;
Chris@42 66 Ts = Rp[WS(rs, 2)];
Chris@42 67 Tt = Rm[WS(rs, 2)];
Chris@42 68 T2 = Ip[WS(rs, 9)];
Chris@42 69 T1V = T7 + T8;
Chris@42 70 T9 = T7 - T8;
Chris@42 71 T2w = Ts - Tt;
Chris@42 72 Tu = Ts + Tt;
Chris@42 73 T3 = Im[WS(rs, 9)];
Chris@42 74 T1 = W[36];
Chris@42 75 T6 = W[37];
Chris@42 76 T1R = W[34];
Chris@42 77 T1S = T2 - T3;
Chris@42 78 T4 = T2 + T3;
Chris@42 79 T1U = W[35];
Chris@42 80 }
Chris@42 81 {
Chris@42 82 E TY, TZ, TC, TD;
Chris@42 83 TY = Ip[0];
Chris@42 84 T1T = T1R * T1S;
Chris@42 85 T2Y = T6 * T4;
Chris@42 86 T5 = T1 * T4;
Chris@42 87 T40 = T1U * T1S;
Chris@42 88 TZ = Im[0];
Chris@42 89 TC = Rp[WS(rs, 7)];
Chris@42 90 TD = Rm[WS(rs, 7)];
Chris@42 91 {
Chris@42 92 E T1w, T1x, TT, TU;
Chris@42 93 T1w = Rp[WS(rs, 1)];
Chris@42 94 T2F = TY - TZ;
Chris@42 95 T10 = TY + TZ;
Chris@42 96 T2C = TC - TD;
Chris@42 97 TE = TC + TD;
Chris@42 98 T1x = Rm[WS(rs, 1)];
Chris@42 99 TT = Rm[0];
Chris@42 100 TU = Rp[0];
Chris@42 101 TX = W[0];
Chris@42 102 T2m = T1w + T1x;
Chris@42 103 T1y = T1w - T1x;
Chris@42 104 T4g = TU + TT;
Chris@42 105 TV = TT - TU;
Chris@42 106 TS = W[1];
Chris@42 107 }
Chris@42 108 }
Chris@42 109 }
Chris@42 110 {
Chris@42 111 E T2j, T1t, T1r, T1s;
Chris@42 112 {
Chris@42 113 E Tx, Ty, T2z, Tz;
Chris@42 114 Tx = Ip[WS(rs, 7)];
Chris@42 115 Ty = Im[WS(rs, 7)];
Chris@42 116 T33 = TX * TV;
Chris@42 117 TW = TS * TV;
Chris@42 118 Tw = W[26];
Chris@42 119 T2z = Tx + Ty;
Chris@42 120 Tz = Tx - Ty;
Chris@42 121 TB = W[27];
Chris@42 122 T2y = W[28];
Chris@42 123 T2B = W[29];
Chris@42 124 TA = Tw * Tz;
Chris@42 125 T3L = TB * Tz;
Chris@42 126 T2A = T2y * T2z;
Chris@42 127 T3t = T2B * T2z;
Chris@42 128 }
Chris@42 129 T1r = Ip[WS(rs, 1)];
Chris@42 130 T1s = Im[WS(rs, 1)];
Chris@42 131 T1q = W[4];
Chris@42 132 T1v = W[5];
Chris@42 133 T2i = W[2];
Chris@42 134 T2j = T1r - T1s;
Chris@42 135 T1t = T1r + T1s;
Chris@42 136 T2l = W[3];
Chris@42 137 {
Chris@42 138 E T2t, Tp, Tn, To;
Chris@42 139 Tn = Ip[WS(rs, 2)];
Chris@42 140 T2k = T2i * T2j;
Chris@42 141 T3d = T1v * T1t;
Chris@42 142 T1u = T1q * T1t;
Chris@42 143 T48 = T2l * T2j;
Chris@42 144 To = Im[WS(rs, 2)];
Chris@42 145 Tm = W[6];
Chris@42 146 Tr = W[7];
Chris@42 147 T2s = W[8];
Chris@42 148 T2t = Tn + To;
Chris@42 149 Tp = Tn - To;
Chris@42 150 T2v = W[9];
Chris@42 151 {
Chris@42 152 E T1e, T1f, T1j, T1k;
Chris@42 153 T1e = Ip[WS(rs, 3)];
Chris@42 154 T2u = T2s * T2t;
Chris@42 155 T3J = Tr * Tp;
Chris@42 156 Tq = Tm * Tp;
Chris@42 157 T3r = T2v * T2t;
Chris@42 158 T1f = Im[WS(rs, 3)];
Chris@42 159 T1j = Rp[WS(rs, 3)];
Chris@42 160 T1k = Rm[WS(rs, 3)];
Chris@42 161 T1d = W[10];
Chris@42 162 T20 = T1e + T1f;
Chris@42 163 T1g = T1e - T1f;
Chris@42 164 T23 = T1j - T1k;
Chris@42 165 T1l = T1j + T1k;
Chris@42 166 T1Z = W[12];
Chris@42 167 T1h = T1d * T1g;
Chris@42 168 }
Chris@42 169 }
Chris@42 170 }
Chris@42 171 {
Chris@42 172 E T2d, T1A, TI, T2G, T26, T13;
Chris@42 173 {
Chris@42 174 E TJ, TK, TO, TP;
Chris@42 175 TJ = Ip[WS(rs, 5)];
Chris@42 176 T3S = T1d * T1l;
Chris@42 177 T3k = T1Z * T23;
Chris@42 178 T21 = T1Z * T20;
Chris@42 179 TK = Im[WS(rs, 5)];
Chris@42 180 TO = Rp[WS(rs, 5)];
Chris@42 181 TP = Rm[WS(rs, 5)];
Chris@42 182 TI = W[20];
Chris@42 183 T2H = TJ - TK;
Chris@42 184 TL = TJ + TK;
Chris@42 185 T2K = TO + TP;
Chris@42 186 TQ = TO - TP;
Chris@42 187 T2G = W[18];
Chris@42 188 TM = TI * TL;
Chris@42 189 }
Chris@42 190 {
Chris@42 191 E T1G, T1H, T1B, T1C;
Chris@42 192 T1G = Rm[WS(rs, 6)];
Chris@42 193 T35 = TI * TQ;
Chris@42 194 T4h = T2G * T2K;
Chris@42 195 T2I = T2G * T2H;
Chris@42 196 T1H = Rp[WS(rs, 6)];
Chris@42 197 T1B = Ip[WS(rs, 6)];
Chris@42 198 T1C = Im[WS(rs, 6)];
Chris@42 199 T2f = W[23];
Chris@42 200 T2g = T1H + T1G;
Chris@42 201 T1I = T1G - T1H;
Chris@42 202 T2d = T1B - T1C;
Chris@42 203 T1D = T1B + T1C;
Chris@42 204 T2c = W[22];
Chris@42 205 T1A = W[24];
Chris@42 206 T46 = T2f * T2d;
Chris@42 207 }
Chris@42 208 {
Chris@42 209 E T14, T15, T19, T1a;
Chris@42 210 T14 = Ip[WS(rs, 8)];
Chris@42 211 T2e = T2c * T2d;
Chris@42 212 T3b = T1A * T1I;
Chris@42 213 T1E = T1A * T1D;
Chris@42 214 T15 = Im[WS(rs, 8)];
Chris@42 215 T19 = Rp[WS(rs, 8)];
Chris@42 216 T1a = Rm[WS(rs, 8)];
Chris@42 217 T28 = W[32];
Chris@42 218 T16 = T14 - T15;
Chris@42 219 T29 = T14 + T15;
Chris@42 220 T1b = T19 + T1a;
Chris@42 221 T26 = T1a - T19;
Chris@42 222 T25 = W[33];
Chris@42 223 T13 = W[30];
Chris@42 224 T3i = T28 * T26;
Chris@42 225 }
Chris@42 226 {
Chris@42 227 E Th, Ti, Tc, Td;
Chris@42 228 Th = Rm[WS(rs, 4)];
Chris@42 229 T27 = T25 * T26;
Chris@42 230 T3Q = T13 * T1b;
Chris@42 231 T17 = T13 * T16;
Chris@42 232 Ti = Rp[WS(rs, 4)];
Chris@42 233 Tc = Ip[WS(rs, 4)];
Chris@42 234 Td = Im[WS(rs, 4)];
Chris@42 235 T1O = W[15];
Chris@42 236 T1P = Ti + Th;
Chris@42 237 Tj = Th - Ti;
Chris@42 238 T1M = Tc - Td;
Chris@42 239 Te = Tc + Td;
Chris@42 240 T1L = W[14];
Chris@42 241 Tb = W[16];
Chris@42 242 T3Y = T1O * T1M;
Chris@42 243 }
Chris@42 244 }
Chris@42 245 {
Chris@42 246 E T1N, T2W, Tf, T2L, T4i;
Chris@42 247 {
Chris@42 248 E T2x, T2D, T3s, T3u, T2J;
Chris@42 249 T2x = FNMS(T2v, T2w, T2u);
Chris@42 250 T1N = T1L * T1M;
Chris@42 251 T2W = Tb * Tj;
Chris@42 252 Tf = Tb * Te;
Chris@42 253 T2D = FNMS(T2B, T2C, T2A);
Chris@42 254 T3s = FMA(T2s, T2w, T3r);
Chris@42 255 T3u = FMA(T2y, T2C, T3t);
Chris@42 256 T2J = W[19];
Chris@42 257 T2E = T2x - T2D;
Chris@42 258 T4W = T2x + T2D;
Chris@42 259 T3v = T3s + T3u;
Chris@42 260 T4k = T3u - T3s;
Chris@42 261 T2L = FNMS(T2J, T2K, T2I);
Chris@42 262 T4i = FMA(T2J, T2H, T4h);
Chris@42 263 }
Chris@42 264 {
Chris@42 265 E T42, T43, T45, T4a, T3O, T3N;
Chris@42 266 {
Chris@42 267 E T2a, T3j, T47, T3l, T24, T2o, T3n, T49, T22, T2h, T2n;
Chris@42 268 T2a = FMA(T28, T29, T27);
Chris@42 269 T3j = FNMS(T25, T29, T3i);
Chris@42 270 T2M = T2F - T2L;
Chris@42 271 T3w = T2L + T2F;
Chris@42 272 T4V = T4g + T4i;
Chris@42 273 T4j = T4g - T4i;
Chris@42 274 T22 = W[13];
Chris@42 275 T2h = FNMS(T2f, T2g, T2e);
Chris@42 276 T2n = FNMS(T2l, T2m, T2k);
Chris@42 277 T47 = FMA(T2c, T2g, T46);
Chris@42 278 T3l = FMA(T22, T20, T3k);
Chris@42 279 T24 = FNMS(T22, T23, T21);
Chris@42 280 T2o = T2h - T2n;
Chris@42 281 T3n = T2h + T2n;
Chris@42 282 T49 = FMA(T2i, T2m, T48);
Chris@42 283 {
Chris@42 284 E T2b, T58, T3m, T59;
Chris@42 285 T2b = T24 - T2a;
Chris@42 286 T58 = T2a + T24;
Chris@42 287 T3m = T3j - T3l;
Chris@42 288 T45 = T3j + T3l;
Chris@42 289 T4a = T47 - T49;
Chris@42 290 T59 = T47 + T49;
Chris@42 291 T2p = T2b - T2o;
Chris@42 292 T2T = T2b + T2o;
Chris@42 293 T5a = T58 + T59;
Chris@42 294 T5A = T59 - T58;
Chris@42 295 T3D = T3m + T3n;
Chris@42 296 T3o = T3m - T3n;
Chris@42 297 }
Chris@42 298 }
Chris@42 299 {
Chris@42 300 E T1z, T3e, T1Q, T3c, T1J, T1W, T3Z, T41, T1F;
Chris@42 301 T1z = FNMS(T1v, T1y, T1u);
Chris@42 302 T3e = FMA(T1q, T1y, T3d);
Chris@42 303 T1F = W[25];
Chris@42 304 T4b = T45 + T4a;
Chris@42 305 T4B = T4a - T45;
Chris@42 306 T1Q = FNMS(T1O, T1P, T1N);
Chris@42 307 T3c = FNMS(T1F, T1D, T3b);
Chris@42 308 T1J = FMA(T1F, T1I, T1E);
Chris@42 309 T1W = FNMS(T1U, T1V, T1T);
Chris@42 310 T3Z = FMA(T1L, T1P, T3Y);
Chris@42 311 T41 = FMA(T1R, T1V, T40);
Chris@42 312 {
Chris@42 313 E T56, T3g, T55, T1K, T1X, T3f;
Chris@42 314 T56 = T1J + T1z;
Chris@42 315 T1K = T1z - T1J;
Chris@42 316 T3g = T1Q + T1W;
Chris@42 317 T1X = T1Q - T1W;
Chris@42 318 T55 = T3Z + T41;
Chris@42 319 T42 = T3Z - T41;
Chris@42 320 T1Y = T1K - T1X;
Chris@42 321 T2S = T1X + T1K;
Chris@42 322 T43 = T3c + T3e;
Chris@42 323 T3f = T3c - T3e;
Chris@42 324 T5z = T55 - T56;
Chris@42 325 T57 = T55 + T56;
Chris@42 326 T3h = T3f - T3g;
Chris@42 327 T3C = T3g + T3f;
Chris@42 328 }
Chris@42 329 }
Chris@42 330 {
Chris@42 331 E Ta, T2Z, T3K, T2X, Tk, TG, T31, T3M, Tg, Tv, TF;
Chris@42 332 Ta = FNMS(T6, T9, T5);
Chris@42 333 T4A = T42 - T43;
Chris@42 334 T44 = T42 + T43;
Chris@42 335 T2Z = FMA(T1, T9, T2Y);
Chris@42 336 Tg = W[17];
Chris@42 337 Tv = FNMS(Tr, Tu, Tq);
Chris@42 338 TF = FNMS(TB, TE, TA);
Chris@42 339 T3K = FMA(Tm, Tu, T3J);
Chris@42 340 T2X = FNMS(Tg, Te, T2W);
Chris@42 341 Tk = FMA(Tg, Tj, Tf);
Chris@42 342 TG = Tv - TF;
Chris@42 343 T31 = Tv + TF;
Chris@42 344 T3M = FMA(Tw, TE, T3L);
Chris@42 345 {
Chris@42 346 E Tl, T4Z, T30, T4Y;
Chris@42 347 Tl = Ta - Tk;
Chris@42 348 T4Z = Tk + Ta;
Chris@42 349 T30 = T2X - T2Z;
Chris@42 350 T3O = T2X + T2Z;
Chris@42 351 T3N = T3K - T3M;
Chris@42 352 T4Y = T3K + T3M;
Chris@42 353 TH = Tl - TG;
Chris@42 354 T2P = TG + Tl;
Chris@42 355 T50 = T4Y + T4Z;
Chris@42 356 T5x = T4Y - T4Z;
Chris@42 357 T3z = T31 + T30;
Chris@42 358 T32 = T30 - T31;
Chris@42 359 }
Chris@42 360 }
Chris@42 361 {
Chris@42 362 E T11, T34, T36, TR, T1i, T3R, T1c, TN, T18;
Chris@42 363 T11 = FMA(TX, T10, TW);
Chris@42 364 T34 = FNMS(TS, T10, T33);
Chris@42 365 TN = W[21];
Chris@42 366 T3P = T3N + T3O;
Chris@42 367 T4D = T3N - T3O;
Chris@42 368 T18 = W[31];
Chris@42 369 T36 = FMA(TN, TL, T35);
Chris@42 370 TR = FNMS(TN, TQ, TM);
Chris@42 371 T1i = W[11];
Chris@42 372 T3R = FMA(T18, T16, T3Q);
Chris@42 373 T1c = FNMS(T18, T1b, T17);
Chris@42 374 {
Chris@42 375 E T52, T12, T3T, T1m;
Chris@42 376 T52 = TR + T11;
Chris@42 377 T12 = TR - T11;
Chris@42 378 T3T = FMA(T1i, T1g, T3S);
Chris@42 379 T1m = FNMS(T1i, T1l, T1h);
Chris@42 380 {
Chris@42 381 E T37, T51, T38, T1n;
Chris@42 382 T3V = T36 + T34;
Chris@42 383 T37 = T34 - T36;
Chris@42 384 T51 = T3R + T3T;
Chris@42 385 T3U = T3R - T3T;
Chris@42 386 T38 = T1c + T1m;
Chris@42 387 T1n = T1c - T1m;
Chris@42 388 T5w = T51 - T52;
Chris@42 389 T53 = T51 + T52;
Chris@42 390 T2Q = T1n + T12;
Chris@42 391 T1o = T12 - T1n;
Chris@42 392 T3A = T38 + T37;
Chris@42 393 T39 = T37 - T38;
Chris@42 394 }
Chris@42 395 }
Chris@42 396 }
Chris@42 397 }
Chris@42 398 }
Chris@42 399 }
/*
 * Output stage: combine the intermediates above and store the results back
 * into Rp/Ip/Rm/Im. Every stored value is scaled by KP500000000 (0.5), and
 * the Im stores that use FMA/FNMS are additionally negated.
 */
Chris@42 400 {
Chris@42 401 E T4l, T4m, T4n, T4w, T4u;
Chris@42 402 {
Chris@42 403 E T4L, T2O, T3W, T4K, T4I, T4G, T4S, T4U, T4J, T4z, T4H;
Chris@42 404 {
Chris@42 405 E T4C, T2N, T4R, T1p, T4E, T2q, T4Q;
Chris@42 406 T4L = T4A + T4B;
Chris@42 407 T4C = T4A - T4B;
Chris@42 408 T2N = T2E + T2M;
Chris@42 409 T2O = T2M - T2E;
Chris@42 410 T4R = T1o - TH;
Chris@42 411 T1p = TH + T1o;
Chris@42 412 T4E = T3U - T3V;
Chris@42 413 T3W = T3U + T3V;
Chris@42 414 T2q = T1Y + T2p;
Chris@42 415 T4Q = T2p - T1Y;
Chris@42 416 {
Chris@42 417 E T4y, T4x, T4F, T2r;
Chris@42 418 T4F = T4D - T4E;
Chris@42 419 T4K = T4D + T4E;
Chris@42 420 T4y = T1p - T2q;
Chris@42 421 T2r = T1p + T2q;
Chris@42 422 T4I = FMA(KP618033988, T4C, T4F);
Chris@42 423 T4G = FNMS(KP618033988, T4F, T4C);
Chris@42 424 T4S = FNMS(KP618033988, T4R, T4Q);
Chris@42 425 T4U = FMA(KP618033988, T4Q, T4R);
Chris@42 426 Im[WS(rs, 4)] = KP500000000 * (T2r - T2N);
Chris@42 427 T4x = FMA(KP250000000, T2r, T2N);
Chris@42 428 T4J = T4j - T4k;
Chris@42 429 T4l = T4j + T4k;
Chris@42 430 T4z = FMA(KP559016994, T4y, T4x);
Chris@42 431 T4H = FNMS(KP559016994, T4y, T4x);
Chris@42 432 }
Chris@42 433 }
Chris@42 434 {
Chris@42 435 E T2R, T4s, T4d, T4f, T4t, T2U, T4P, T4T;
Chris@42 436 {
Chris@42 437 E T3X, T4O, T4M, T4c, T4N;
Chris@42 438 T4m = T3P + T3W;
Chris@42 439 T3X = T3P - T3W;
Chris@42 440 Ip[WS(rs, 7)] = KP500000000 * (FMA(KP951056516, T4G, T4z));
Chris@42 441 Ip[WS(rs, 3)] = KP500000000 * (FNMS(KP951056516, T4G, T4z));
Chris@42 442 Im[WS(rs, 8)] = -(KP500000000 * (FNMS(KP951056516, T4I, T4H)));
Chris@42 443 Im[0] = -(KP500000000 * (FMA(KP951056516, T4I, T4H)));
Chris@42 444 T4O = T4K - T4L;
Chris@42 445 T4M = T4K + T4L;
Chris@42 446 T4c = T44 - T4b;
Chris@42 447 T4n = T44 + T4b;
Chris@42 448 T2R = T2P + T2Q;
Chris@42 449 T4s = T2P - T2Q;
Chris@42 450 Rm[WS(rs, 4)] = KP500000000 * (T4J + T4M);
Chris@42 451 T4N = FNMS(KP250000000, T4M, T4J);
Chris@42 452 T4d = FMA(KP618033988, T4c, T3X);
Chris@42 453 T4f = FNMS(KP618033988, T3X, T4c);
Chris@42 454 T4t = T2S - T2T;
Chris@42 455 T2U = T2S + T2T;
Chris@42 456 T4P = FNMS(KP559016994, T4O, T4N);
Chris@42 457 T4T = FMA(KP559016994, T4O, T4N);
Chris@42 458 }
Chris@42 459 {
Chris@42 460 E T3H, T3G, T2V, T3I, T4e;
Chris@42 461 T2V = T2R + T2U;
Chris@42 462 T3H = T2R - T2U;
Chris@42 463 Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T4S, T4P));
Chris@42 464 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T4S, T4P));
Chris@42 465 Rm[0] = KP500000000 * (FNMS(KP951056516, T4U, T4T));
Chris@42 466 Rm[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T4U, T4T));
Chris@42 467 Ip[WS(rs, 5)] = KP500000000 * (T2O + T2V);
Chris@42 468 T3G = FNMS(KP250000000, T2V, T2O);
Chris@42 469 T3I = FMA(KP559016994, T3H, T3G);
Chris@42 470 T4e = FNMS(KP559016994, T3H, T3G);
Chris@42 471 T4w = FNMS(KP618033988, T4s, T4t);
Chris@42 472 T4u = FMA(KP618033988, T4t, T4s);
Chris@42 473 Ip[WS(rs, 9)] = KP500000000 * (FMA(KP951056516, T4d, T3I));
Chris@42 474 Ip[WS(rs, 1)] = KP500000000 * (FNMS(KP951056516, T4d, T3I));
Chris@42 475 Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP951056516, T4f, T4e)));
Chris@42 476 Im[WS(rs, 2)] = -(KP500000000 * (FMA(KP951056516, T4f, T4e)));
Chris@42 477 }
Chris@42 478 }
Chris@42 479 }
Chris@42 480 {
Chris@42 481 E T3y, T5O, T5Q, T5F, T5K, T5I;
Chris@42 482 {
Chris@42 483 E T5G, T5H, T3x, T4q, T5E, T5C, T3a, T5N, T4p, T5M, T3p, T5y, T5B, T4o;
Chris@42 484 T5G = T5x + T5w;
Chris@42 485 T5y = T5w - T5x;
Chris@42 486 T5B = T5z - T5A;
Chris@42 487 T5H = T5z + T5A;
Chris@42 488 T3y = T3w - T3v;
Chris@42 489 T3x = T3v + T3w;
Chris@42 490 T4q = T4m - T4n;
Chris@42 491 T4o = T4m + T4n;
Chris@42 492 T5E = FMA(KP618033988, T5y, T5B);
Chris@42 493 T5C = FNMS(KP618033988, T5B, T5y);
Chris@42 494 T3a = T32 + T39;
Chris@42 495 T5N = T39 - T32;
Chris@42 496 Rp[WS(rs, 5)] = KP500000000 * (T4l + T4o);
Chris@42 497 T4p = FNMS(KP250000000, T4o, T4l);
Chris@42 498 T5M = T3o - T3h;
Chris@42 499 T3p = T3h + T3o;
Chris@42 500 {
Chris@42 501 E T5u, T5t, T4r, T4v, T3q, T5D, T5v;
Chris@42 502 T4r = FMA(KP559016994, T4q, T4p);
Chris@42 503 T4v = FNMS(KP559016994, T4q, T4p);
Chris@42 504 T5u = T3p - T3a;
Chris@42 505 T3q = T3a + T3p;
Chris@42 506 Rp[WS(rs, 9)] = KP500000000 * (FNMS(KP951056516, T4u, T4r));
Chris@42 507 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T4u, T4r));
Chris@42 508 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T4w, T4v));
Chris@42 509 Rm[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T4w, T4v));
Chris@42 510 Im[WS(rs, 9)] = KP500000000 * (T3q - T3x);
Chris@42 511 T5t = FMA(KP250000000, T3q, T3x);
Chris@42 512 T5O = FNMS(KP618033988, T5N, T5M);
Chris@42 513 T5Q = FMA(KP618033988, T5M, T5N);
Chris@42 514 T5F = T4V - T4W;
Chris@42 515 T4X = T4V + T4W;
Chris@42 516 T5D = FNMS(KP559016994, T5u, T5t);
Chris@42 517 T5v = FMA(KP559016994, T5u, T5t);
Chris@42 518 Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP951056516, T5C, T5v)));
Chris@42 519 Ip[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T5C, T5v));
Chris@42 520 Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP951056516, T5E, T5D)));
Chris@42 521 Ip[WS(rs, 2)] = KP500000000 * (FMA(KP951056516, T5E, T5D));
Chris@42 522 T5K = T5G - T5H;
Chris@42 523 T5I = T5G + T5H;
Chris@42 524 }
Chris@42 525 }
Chris@42 526 {
Chris@42 527 E T54, T5b, T5s, T5q, T5g, T5h, T3F, T5m, T5o, T5p, T5J, T5l, T5r, T5n;
Chris@42 528 T54 = T50 + T53;
Chris@42 529 T5o = T50 - T53;
Chris@42 530 T5p = T5a - T57;
Chris@42 531 T5b = T57 + T5a;
Chris@42 532 Rm[WS(rs, 9)] = KP500000000 * (T5F + T5I);
Chris@42 533 T5J = FNMS(KP250000000, T5I, T5F);
Chris@42 534 T5s = FMA(KP618033988, T5o, T5p);
Chris@42 535 T5q = FNMS(KP618033988, T5p, T5o);
Chris@42 536 {
Chris@42 537 E T5L, T5P, T3B, T3E;
Chris@42 538 T5L = FNMS(KP559016994, T5K, T5J);
Chris@42 539 T5P = FMA(KP559016994, T5K, T5J);
Chris@42 540 T3B = T3z + T3A;
Chris@42 541 T5g = T3z - T3A;
Chris@42 542 T5h = T3C - T3D;
Chris@42 543 T3E = T3C + T3D;
Chris@42 544 Rm[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T5O, T5L));
Chris@42 545 Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T5O, T5L));
Chris@42 546 Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP951056516, T5Q, T5P));
Chris@42 547 Rp[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T5Q, T5P));
Chris@42 548 T3F = T3B + T3E;
Chris@42 549 T5m = T3B - T3E;
Chris@42 550 }
Chris@42 551 Ip[0] = KP500000000 * (T3y + T3F);
Chris@42 552 T5l = FNMS(KP250000000, T3F, T3y);
Chris@42 553 T5i = FMA(KP618033988, T5h, T5g);
Chris@42 554 T5k = FNMS(KP618033988, T5g, T5h);
Chris@42 555 T5r = FNMS(KP559016994, T5m, T5l);
Chris@42 556 T5n = FMA(KP559016994, T5m, T5l);
Chris@42 557 Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP951056516, T5q, T5n)));
Chris@42 558 Ip[WS(rs, 4)] = KP500000000 * (FMA(KP951056516, T5q, T5n));
Chris@42 559 Im[WS(rs, 7)] = -(KP500000000 * (FNMS(KP951056516, T5s, T5r)));
Chris@42 560 Ip[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5s, T5r));
Chris@42 561 T5e = T54 - T5b;
Chris@42 562 T5c = T54 + T5b;
Chris@42 563 }
Chris@42 564 }
Chris@42 565 }
Chris@42 566 }
/*
 * Last five stores of the iteration (Rp[0] and the Rp/Rm entries at WS(rs, 3),
 * WS(rs, 4), WS(rs, 7), WS(rs, 8)), built from values computed in the nested
 * block above and carried out through the variables declared at loop scope.
 */
Chris@42 567 Rp[0] = KP500000000 * (T4X + T5c);
Chris@42 568 T5d = FNMS(KP250000000, T5c, T4X);
Chris@42 569 T5j = FNMS(KP559016994, T5e, T5d);
Chris@42 570 T5f = FMA(KP559016994, T5e, T5d);
Chris@42 571 Rm[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T5i, T5f));
Chris@42 572 Rp[WS(rs, 4)] = KP500000000 * (FNMS(KP951056516, T5i, T5f));
Chris@42 573 Rm[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T5k, T5j));
Chris@42 574 Rp[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5k, T5j));
Chris@42 575 }
Chris@42 576 }
Chris@42 577 }
Chris@42 578
/*
 * Twiddle-instruction list for this codelet; it is referenced from the
 * hc2c_desc initializer `desc`. TW_FULL, TW_NEXT and the meaning of the
 * numeric fields are defined in the FFTW headers, not in this file (the 20
 * presumably matches the "-n 20" transform size -- confirm there).
 */
Chris@42 579 static const tw_instr twinstr[] = {
Chris@42 580 {TW_FULL, 1, 20},
Chris@42 581 {TW_NEXT, 1, 0}
Chris@42 582 };
Chris@42 583
/*
 * Descriptor handed to the planner at registration time. Fields, in order:
 * 20 (the value given to the generator as "-n 20"), the codelet name, the
 * twiddle-instruction list, the address of GENUS (declared outside this file),
 * and {176, 78, 110, 0}, whose first three entries match the generator's
 * operation counts for this variant: 176 additions, 78 multiplications and
 * 110 fused multiply/adds. The exact field layout of hc2c_desc is defined in
 * the FFTW headers -- confirm there.
 */
Chris@42 584 static const hc2c_desc desc = { 20, "hc2cfdft_20", twinstr, &GENUS, {176, 78, 110, 0} };
Chris@42 585
/*
 * Registration entry point: passes the hc2cfdft_20 kernel, its descriptor and
 * the HC2C_VIA_DFT flag to X(khc2c_register) for planner p. X() is a macro
 * defined outside this file (presumably a symbol-prefixing macro -- confirm).
 */
Chris@42 586 void X(codelet_hc2cfdft_20) (planner *p) {
Chris@42 587 X(khc2c_register) (p, hc2cfdft_20, &desc, HC2C_VIA_DFT);
Chris@42 588 }
Chris@42 589 #else /* HAVE_FMA */
Chris@42 590
Chris@42 591 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cfdft_20 -include hc2cf.h */
Chris@42 592
Chris@42 593 /*
Chris@42 594 * This function contains 286 FP additions, 140 FP multiplications,
Chris@42 595 * (or, 224 additions, 78 multiplications, 62 fused multiply/add),
Chris@42 596 * 98 stack variables, 5 constants, and 80 memory accesses
Chris@42 597 */
Chris@42 598 #include "hc2cf.h"
Chris@42 599
Chris@42 600 static void hc2cfdft_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 601 {
Chris@42 602 DK(KP125000000, +0.125000000000000000000000000000000000000000000);
Chris@42 603 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 604 DK(KP279508497, +0.279508497187473712051146708591409529430077295);
Chris@42 605 DK(KP293892626, +0.293892626146236564584352977319536384298826219);
Chris@42 606 DK(KP475528258, +0.475528258147576786058219666689691071702849317);
Chris@42 607 {
Chris@42 608 INT m;
Chris@42 609 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@42 610 E T12, T2w, T4o, T4V, T2H, T3a, T4y, T4Y, T1z, T2v, T25, T2y, T2s, T2z, T4v;
Chris@42 611 E T4X, T4r, T4U, T3A, T3Z, T2X, T37, T3k, T41, T2M, T39, T3v, T3Y, T2S, T36;
Chris@42 612 E T3p, T42, Td, T4G, T33, T3N, Tw, T4H, T32, T3O;
Chris@42 613 {
Chris@42 614 E T3, T3L, T1x, T2V, Th, Tl, TC, T3g, Tq, Tu, TH, T3h, T7, Tb, T1q;
Chris@42 615 E T2U, TR, T2P, T1F, T3r, T23, T2K, T2f, T3y, T1k, T3m, T2q, T2E, T10, T2Q;
Chris@42 616 E T1K, T3s, T1U, T2J, T2a, T3x, T1b, T3l, T2l, T2D;
Chris@42 617 {
Chris@42 618 E T1, T2, T1s, T1u, T1v, T1w, T1r, T1t;
Chris@42 619 T1 = Ip[0];
Chris@42 620 T2 = Im[0];
Chris@42 621 T1s = T1 + T2;
Chris@42 622 T1u = Rp[0];
Chris@42 623 T1v = Rm[0];
Chris@42 624 T1w = T1u - T1v;
Chris@42 625 T3 = T1 - T2;
Chris@42 626 T3L = T1u + T1v;
Chris@42 627 T1r = W[0];
Chris@42 628 T1t = W[1];
Chris@42 629 T1x = FNMS(T1t, T1w, T1r * T1s);
Chris@42 630 T2V = FMA(T1r, T1w, T1t * T1s);
Chris@42 631 }
Chris@42 632 {
Chris@42 633 E Tf, Tg, Tz, Tj, Tk, TB, Ty, TA;
Chris@42 634 Tf = Ip[WS(rs, 2)];
Chris@42 635 Tg = Im[WS(rs, 2)];
Chris@42 636 Tz = Tf - Tg;
Chris@42 637 Tj = Rp[WS(rs, 2)];
Chris@42 638 Tk = Rm[WS(rs, 2)];
Chris@42 639 TB = Tj + Tk;
Chris@42 640 Th = Tf + Tg;
Chris@42 641 Tl = Tj - Tk;
Chris@42 642 Ty = W[6];
Chris@42 643 TA = W[7];
Chris@42 644 TC = FNMS(TA, TB, Ty * Tz);
Chris@42 645 T3g = FMA(TA, Tz, Ty * TB);
Chris@42 646 }
Chris@42 647 {
Chris@42 648 E To, Tp, TE, Ts, Tt, TG, TD, TF;
Chris@42 649 To = Ip[WS(rs, 7)];
Chris@42 650 Tp = Im[WS(rs, 7)];
Chris@42 651 TE = To - Tp;
Chris@42 652 Ts = Rp[WS(rs, 7)];
Chris@42 653 Tt = Rm[WS(rs, 7)];
Chris@42 654 TG = Ts + Tt;
Chris@42 655 Tq = To + Tp;
Chris@42 656 Tu = Ts - Tt;
Chris@42 657 TD = W[26];
Chris@42 658 TF = W[27];
Chris@42 659 TH = FNMS(TF, TG, TD * TE);
Chris@42 660 T3h = FMA(TF, TE, TD * TG);
Chris@42 661 }
Chris@42 662 {
Chris@42 663 E T5, T6, T1n, T9, Ta, T1p, T1m, T1o;
Chris@42 664 T5 = Ip[WS(rs, 5)];
Chris@42 665 T6 = Im[WS(rs, 5)];
Chris@42 666 T1n = T5 + T6;
Chris@42 667 T9 = Rp[WS(rs, 5)];
Chris@42 668 Ta = Rm[WS(rs, 5)];
Chris@42 669 T1p = T9 - Ta;
Chris@42 670 T7 = T5 - T6;
Chris@42 671 Tb = T9 + Ta;
Chris@42 672 T1m = W[20];
Chris@42 673 T1o = W[21];
Chris@42 674 T1q = FNMS(T1o, T1p, T1m * T1n);
Chris@42 675 T2U = FMA(T1m, T1p, T1o * T1n);
Chris@42 676 }
Chris@42 677 {
Chris@42 678 E TM, T1C, TQ, T1E;
Chris@42 679 {
Chris@42 680 E TK, TL, TO, TP;
Chris@42 681 TK = Ip[WS(rs, 4)];
Chris@42 682 TL = Im[WS(rs, 4)];
Chris@42 683 TM = TK + TL;
Chris@42 684 T1C = TK - TL;
Chris@42 685 TO = Rp[WS(rs, 4)];
Chris@42 686 TP = Rm[WS(rs, 4)];
Chris@42 687 TQ = TO - TP;
Chris@42 688 T1E = TO + TP;
Chris@42 689 }
Chris@42 690 {
Chris@42 691 E TJ, TN, T1B, T1D;
Chris@42 692 TJ = W[16];
Chris@42 693 TN = W[17];
Chris@42 694 TR = FNMS(TN, TQ, TJ * TM);
Chris@42 695 T2P = FMA(TN, TM, TJ * TQ);
Chris@42 696 T1B = W[14];
Chris@42 697 T1D = W[15];
Chris@42 698 T1F = FNMS(T1D, T1E, T1B * T1C);
Chris@42 699 T3r = FMA(T1D, T1C, T1B * T1E);
Chris@42 700 }
Chris@42 701 }
Chris@42 702 {
Chris@42 703 E T1Y, T2c, T22, T2e;
Chris@42 704 {
Chris@42 705 E T1W, T1X, T20, T21;
Chris@42 706 T1W = Ip[WS(rs, 1)];
Chris@42 707 T1X = Im[WS(rs, 1)];
Chris@42 708 T1Y = T1W + T1X;
Chris@42 709 T2c = T1W - T1X;
Chris@42 710 T20 = Rp[WS(rs, 1)];
Chris@42 711 T21 = Rm[WS(rs, 1)];
Chris@42 712 T22 = T20 - T21;
Chris@42 713 T2e = T20 + T21;
Chris@42 714 }
Chris@42 715 {
Chris@42 716 E T1V, T1Z, T2b, T2d;
Chris@42 717 T1V = W[4];
Chris@42 718 T1Z = W[5];
Chris@42 719 T23 = FNMS(T1Z, T22, T1V * T1Y);
Chris@42 720 T2K = FMA(T1Z, T1Y, T1V * T22);
Chris@42 721 T2b = W[2];
Chris@42 722 T2d = W[3];
Chris@42 723 T2f = FNMS(T2d, T2e, T2b * T2c);
Chris@42 724 T3y = FMA(T2d, T2c, T2b * T2e);
Chris@42 725 }
Chris@42 726 }
Chris@42 727 {
Chris@42 728 E T1f, T2n, T1j, T2p;
Chris@42 729 {
Chris@42 730 E T1d, T1e, T1h, T1i;
Chris@42 731 T1d = Ip[WS(rs, 3)];
Chris@42 732 T1e = Im[WS(rs, 3)];
Chris@42 733 T1f = T1d - T1e;
Chris@42 734 T2n = T1d + T1e;
Chris@42 735 T1h = Rp[WS(rs, 3)];
Chris@42 736 T1i = Rm[WS(rs, 3)];
Chris@42 737 T1j = T1h + T1i;
Chris@42 738 T2p = T1h - T1i;
Chris@42 739 }
Chris@42 740 {
Chris@42 741 E T1c, T1g, T2m, T2o;
Chris@42 742 T1c = W[10];
Chris@42 743 T1g = W[11];
Chris@42 744 T1k = FNMS(T1g, T1j, T1c * T1f);
Chris@42 745 T3m = FMA(T1c, T1j, T1g * T1f);
Chris@42 746 T2m = W[12];
Chris@42 747 T2o = W[13];
Chris@42 748 T2q = FNMS(T2o, T2p, T2m * T2n);
Chris@42 749 T2E = FMA(T2m, T2p, T2o * T2n);
Chris@42 750 }
Chris@42 751 }
Chris@42 752 {
Chris@42 753 E TV, T1H, TZ, T1J;
Chris@42 754 {
Chris@42 755 E TT, TU, TX, TY;
Chris@42 756 TT = Ip[WS(rs, 9)];
Chris@42 757 TU = Im[WS(rs, 9)];
Chris@42 758 TV = TT + TU;
Chris@42 759 T1H = TT - TU;
Chris@42 760 TX = Rp[WS(rs, 9)];
Chris@42 761 TY = Rm[WS(rs, 9)];
Chris@42 762 TZ = TX - TY;
Chris@42 763 T1J = TX + TY;
Chris@42 764 }
Chris@42 765 {
Chris@42 766 E TS, TW, T1G, T1I;
Chris@42 767 TS = W[36];
Chris@42 768 TW = W[37];
Chris@42 769 T10 = FNMS(TW, TZ, TS * TV);
Chris@42 770 T2Q = FMA(TW, TV, TS * TZ);
Chris@42 771 T1G = W[34];
Chris@42 772 T1I = W[35];
Chris@42 773 T1K = FNMS(T1I, T1J, T1G * T1H);
Chris@42 774 T3s = FMA(T1I, T1H, T1G * T1J);
Chris@42 775 }
Chris@42 776 }
Chris@42 777 {
Chris@42 778 E T1P, T27, T1T, T29;
Chris@42 779 {
Chris@42 780 E T1N, T1O, T1R, T1S;
Chris@42 781 T1N = Ip[WS(rs, 6)];
Chris@42 782 T1O = Im[WS(rs, 6)];
Chris@42 783 T1P = T1N + T1O;
Chris@42 784 T27 = T1N - T1O;
Chris@42 785 T1R = Rp[WS(rs, 6)];
Chris@42 786 T1S = Rm[WS(rs, 6)];
Chris@42 787 T1T = T1R - T1S;
Chris@42 788 T29 = T1R + T1S;
Chris@42 789 }
Chris@42 790 {
Chris@42 791 E T1M, T1Q, T26, T28;
Chris@42 792 T1M = W[24];
Chris@42 793 T1Q = W[25];
Chris@42 794 T1U = FNMS(T1Q, T1T, T1M * T1P);
Chris@42 795 T2J = FMA(T1Q, T1P, T1M * T1T);
Chris@42 796 T26 = W[22];
Chris@42 797 T28 = W[23];
Chris@42 798 T2a = FNMS(T28, T29, T26 * T27);
Chris@42 799 T3x = FMA(T28, T27, T26 * T29);
Chris@42 800 }
Chris@42 801 }
Chris@42 802 {
Chris@42 803 E T16, T2k, T1a, T2i;
Chris@42 804 {
Chris@42 805 E T14, T15, T18, T19;
Chris@42 806 T14 = Ip[WS(rs, 8)];
Chris@42 807 T15 = Im[WS(rs, 8)];
Chris@42 808 T16 = T14 - T15;
Chris@42 809 T2k = T14 + T15;
Chris@42 810 T18 = Rp[WS(rs, 8)];
Chris@42 811 T19 = Rm[WS(rs, 8)];
Chris@42 812 T1a = T18 + T19;
Chris@42 813 T2i = T19 - T18;
Chris@42 814 }
Chris@42 815 {
Chris@42 816 E T13, T17, T2h, T2j;
Chris@42 817 T13 = W[30];
Chris@42 818 T17 = W[31];
Chris@42 819 T1b = FNMS(T17, T1a, T13 * T16);
Chris@42 820 T3l = FMA(T13, T1a, T17 * T16);
Chris@42 821 T2h = W[33];
Chris@42 822 T2j = W[32];
Chris@42 823 T2l = FMA(T2h, T2i, T2j * T2k);
Chris@42 824 T2D = FNMS(T2h, T2k, T2j * T2i);
Chris@42 825 }
Chris@42 826 }
Chris@42 827 {
Chris@42 828 E T2g, T2r, T3n, T3o;
Chris@42 829 {
Chris@42 830 E TI, T11, T4m, T4n;
Chris@42 831 TI = TC - TH;
Chris@42 832 T11 = TR - T10;
Chris@42 833 T12 = TI - T11;
Chris@42 834 T2w = TI + T11;
Chris@42 835 T4m = T3g + T3h;
Chris@42 836 T4n = TR + T10;
Chris@42 837 T4o = T4m + T4n;
Chris@42 838 T4V = T4m - T4n;
Chris@42 839 }
Chris@42 840 {
Chris@42 841 E T2F, T2G, T4w, T4x;
Chris@42 842 T2F = T2D - T2E;
Chris@42 843 T2G = T2a + T2f;
Chris@42 844 T2H = T2F - T2G;
Chris@42 845 T3a = T2F + T2G;
Chris@42 846 T4w = T2l + T2q;
Chris@42 847 T4x = T3x + T3y;
Chris@42 848 T4y = T4w + T4x;
Chris@42 849 T4Y = T4x - T4w;
Chris@42 850 }
Chris@42 851 {
Chris@42 852 E T1l, T1y, T1L, T24;
Chris@42 853 T1l = T1b - T1k;
Chris@42 854 T1y = T1q - T1x;
Chris@42 855 T1z = T1l + T1y;
Chris@42 856 T2v = T1y - T1l;
Chris@42 857 T1L = T1F - T1K;
Chris@42 858 T24 = T1U - T23;
Chris@42 859 T25 = T1L - T24;
Chris@42 860 T2y = T1L + T24;
Chris@42 861 }
Chris@42 862 T2g = T2a - T2f;
Chris@42 863 T2r = T2l - T2q;
Chris@42 864 T2s = T2g - T2r;
Chris@42 865 T2z = T2r + T2g;
Chris@42 866 {
Chris@42 867 E T4t, T4u, T4p, T4q;
Chris@42 868 T4t = T3r + T3s;
Chris@42 869 T4u = T1U + T23;
Chris@42 870 T4v = T4t + T4u;
Chris@42 871 T4X = T4t - T4u;
Chris@42 872 T4p = T3l + T3m;
Chris@42 873 T4q = T1q + T1x;
Chris@42 874 T4r = T4p + T4q;
Chris@42 875 T4U = T4p - T4q;
Chris@42 876 }
Chris@42 877 {
Chris@42 878 E T3w, T3z, T2T, T2W;
Chris@42 879 T3w = T2D + T2E;
Chris@42 880 T3z = T3x - T3y;
Chris@42 881 T3A = T3w + T3z;
Chris@42 882 T3Z = T3z - T3w;
Chris@42 883 T2T = T1b + T1k;
Chris@42 884 T2W = T2U + T2V;
Chris@42 885 T2X = T2T + T2W;
Chris@42 886 T37 = T2T - T2W;
Chris@42 887 }
Chris@42 888 {
Chris@42 889 E T3i, T3j, T2I, T2L;
Chris@42 890 T3i = T3g - T3h;
Chris@42 891 T3j = T2Q - T2P;
Chris@42 892 T3k = T3i + T3j;
Chris@42 893 T41 = T3i - T3j;
Chris@42 894 T2I = T1F + T1K;
Chris@42 895 T2L = T2J + T2K;
Chris@42 896 T2M = T2I + T2L;
Chris@42 897 T39 = T2I - T2L;
Chris@42 898 }
Chris@42 899 {
Chris@42 900 E T3t, T3u, T2O, T2R;
Chris@42 901 T3t = T3r - T3s;
Chris@42 902 T3u = T2K - T2J;
Chris@42 903 T3v = T3t + T3u;
Chris@42 904 T3Y = T3t - T3u;
Chris@42 905 T2O = TC + TH;
Chris@42 906 T2R = T2P + T2Q;
Chris@42 907 T2S = T2O + T2R;
Chris@42 908 T36 = T2O - T2R;
Chris@42 909 }
Chris@42 910 T3n = T3l - T3m;
Chris@42 911 T3o = T2U - T2V;
Chris@42 912 T3p = T3n + T3o;
Chris@42 913 T42 = T3n - T3o;
Chris@42 914 {
Chris@42 915 E Tc, T3M, T4, T8;
Chris@42 916 T4 = W[18];
Chris@42 917 T8 = W[19];
Chris@42 918 Tc = FNMS(T8, Tb, T4 * T7);
Chris@42 919 T3M = FMA(T4, Tb, T8 * T7);
Chris@42 920 Td = T3 - Tc;
Chris@42 921 T4G = T3L + T3M;
Chris@42 922 T33 = Tc + T3;
Chris@42 923 T3N = T3L - T3M;
Chris@42 924 }
Chris@42 925 {
Chris@42 926 E Tm, T30, Tv, T31;
Chris@42 927 {
Chris@42 928 E Te, Ti, Tn, Tr;
Chris@42 929 Te = W[8];
Chris@42 930 Ti = W[9];
Chris@42 931 Tm = FNMS(Ti, Tl, Te * Th);
Chris@42 932 T30 = FMA(Ti, Th, Te * Tl);
Chris@42 933 Tn = W[28];
Chris@42 934 Tr = W[29];
Chris@42 935 Tv = FNMS(Tr, Tu, Tn * Tq);
Chris@42 936 T31 = FMA(Tr, Tq, Tn * Tu);
Chris@42 937 }
Chris@42 938 Tw = Tm - Tv;
Chris@42 939 T4H = Tm + Tv;
Chris@42 940 T32 = T30 + T31;
Chris@42 941 T3O = T31 - T30;
Chris@42 942 }
Chris@42 943 }
Chris@42 944 }
Chris@42 945 {
Chris@42 946 E T3C, T3E, Tx, T2u, T3d, T3e, T3D, T3f;
Chris@42 947 {
Chris@42 948 E T3q, T3B, T1A, T2t;
Chris@42 949 T3q = T3k - T3p;
Chris@42 950 T3B = T3v - T3A;
Chris@42 951 T3C = FMA(KP475528258, T3q, KP293892626 * T3B);
Chris@42 952 T3E = FNMS(KP293892626, T3q, KP475528258 * T3B);
Chris@42 953 Tx = Td - Tw;
Chris@42 954 T1A = T12 + T1z;
Chris@42 955 T2t = T25 + T2s;
Chris@42 956 T2u = T1A + T2t;
Chris@42 957 T3d = KP279508497 * (T1A - T2t);
Chris@42 958 T3e = FNMS(KP125000000, T2u, KP500000000 * Tx);
Chris@42 959 }
Chris@42 960 Ip[WS(rs, 5)] = KP500000000 * (Tx + T2u);
Chris@42 961 T3D = T3d - T3e;
Chris@42 962 Im[WS(rs, 2)] = T3D - T3E;
Chris@42 963 Im[WS(rs, 6)] = T3D + T3E;
Chris@42 964 T3f = T3d + T3e;
Chris@42 965 Ip[WS(rs, 1)] = T3f - T3C;
Chris@42 966 Ip[WS(rs, 9)] = T3f + T3C;
Chris@42 967 }
Chris@42 968 {
Chris@42 969 E T3H, T3T, T3P, T3Q, T3K, T3R, T3U, T3S;
Chris@42 970 {
Chris@42 971 E T3F, T3G, T3I, T3J;
Chris@42 972 T3F = T12 - T1z;
Chris@42 973 T3G = T25 - T2s;
Chris@42 974 T3H = FMA(KP475528258, T3F, KP293892626 * T3G);
Chris@42 975 T3T = FNMS(KP293892626, T3F, KP475528258 * T3G);
Chris@42 976 T3P = T3N + T3O;
Chris@42 977 T3I = T3k + T3p;
Chris@42 978 T3J = T3v + T3A;
Chris@42 979 T3Q = T3I + T3J;
Chris@42 980 T3K = KP279508497 * (T3I - T3J);
Chris@42 981 T3R = FNMS(KP125000000, T3Q, KP500000000 * T3P);
Chris@42 982 }
Chris@42 983 Rp[WS(rs, 5)] = KP500000000 * (T3P + T3Q);
Chris@42 984 T3U = T3R - T3K;
Chris@42 985 Rm[WS(rs, 6)] = T3T + T3U;
Chris@42 986 Rm[WS(rs, 2)] = T3U - T3T;
Chris@42 987 T3S = T3K + T3R;
Chris@42 988 Rp[WS(rs, 1)] = T3H + T3S;
Chris@42 989 Rp[WS(rs, 9)] = T3S - T3H;
Chris@42 990 }
Chris@42 991 {
Chris@42 992 E T44, T46, T2C, T2B, T3V, T3W, T45, T3X;
Chris@42 993 {
Chris@42 994 E T40, T43, T2x, T2A;
Chris@42 995 T40 = T3Y - T3Z;
Chris@42 996 T43 = T41 - T42;
Chris@42 997 T44 = FNMS(KP293892626, T43, KP475528258 * T40);
Chris@42 998 T46 = FMA(KP475528258, T43, KP293892626 * T40);
Chris@42 999 T2C = Tw + Td;
Chris@42 1000 T2x = T2v - T2w;
Chris@42 1001 T2A = T2y + T2z;
Chris@42 1002 T2B = T2x - T2A;
Chris@42 1003 T3V = FMA(KP500000000, T2C, KP125000000 * T2B);
Chris@42 1004 T3W = KP279508497 * (T2x + T2A);
Chris@42 1005 }
Chris@42 1006 Im[WS(rs, 4)] = KP500000000 * (T2B - T2C);
Chris@42 1007 T45 = T3W - T3V;
Chris@42 1008 Im[0] = T45 - T46;
Chris@42 1009 Im[WS(rs, 8)] = T45 + T46;
Chris@42 1010 T3X = T3V + T3W;
Chris@42 1011 Ip[WS(rs, 3)] = T3X - T44;
Chris@42 1012 Ip[WS(rs, 7)] = T3X + T44;
Chris@42 1013 }
Chris@42 1014 {
Chris@42 1015 E T49, T4h, T4a, T4d, T4e, T4f, T4i, T4g;
Chris@42 1016 {
Chris@42 1017 E T47, T48, T4b, T4c;
Chris@42 1018 T47 = T2y - T2z;
Chris@42 1019 T48 = T2w + T2v;
Chris@42 1020 T49 = FNMS(KP293892626, T48, KP475528258 * T47);
Chris@42 1021 T4h = FMA(KP475528258, T48, KP293892626 * T47);
Chris@42 1022 T4a = T3N - T3O;
Chris@42 1023 T4b = T41 + T42;
Chris@42 1024 T4c = T3Y + T3Z;
Chris@42 1025 T4d = T4b + T4c;
Chris@42 1026 T4e = FNMS(KP125000000, T4d, KP500000000 * T4a);
Chris@42 1027 T4f = KP279508497 * (T4b - T4c);
Chris@42 1028 }
Chris@42 1029 Rm[WS(rs, 4)] = KP500000000 * (T4a + T4d);
Chris@42 1030 T4i = T4f + T4e;
Chris@42 1031 Rm[WS(rs, 8)] = T4h + T4i;
Chris@42 1032 Rm[0] = T4i - T4h;
Chris@42 1033 T4g = T4e - T4f;
Chris@42 1034 Rp[WS(rs, 3)] = T49 + T4g;
Chris@42 1035 Rp[WS(rs, 7)] = T4g - T49;
Chris@42 1036 }
Chris@42 1037 {
Chris@42 1038 E T50, T52, T34, T2Z, T4R, T4S, T51, T4T;
Chris@42 1039 {
Chris@42 1040 E T4W, T4Z, T2N, T2Y;
Chris@42 1041 T4W = T4U - T4V;
Chris@42 1042 T4Z = T4X - T4Y;
Chris@42 1043 T50 = FNMS(KP293892626, T4Z, KP475528258 * T4W);
Chris@42 1044 T52 = FMA(KP293892626, T4W, KP475528258 * T4Z);
Chris@42 1045 T34 = T32 + T33;
Chris@42 1046 T2N = T2H - T2M;
Chris@42 1047 T2Y = T2S + T2X;
Chris@42 1048 T2Z = T2N - T2Y;
Chris@42 1049 T4R = FMA(KP500000000, T34, KP125000000 * T2Z);
Chris@42 1050 T4S = KP279508497 * (T2Y + T2N);
Chris@42 1051 }
Chris@42 1052 Im[WS(rs, 9)] = KP500000000 * (T2Z - T34);
Chris@42 1053 T51 = T4R - T4S;
Chris@42 1054 Ip[WS(rs, 2)] = T51 + T52;
Chris@42 1055 Im[WS(rs, 1)] = T52 - T51;
Chris@42 1056 T4T = T4R + T4S;
Chris@42 1057 Ip[WS(rs, 6)] = T4T + T50;
Chris@42 1058 Im[WS(rs, 5)] = T50 - T4T;
Chris@42 1059 }
Chris@42 1060 {
Chris@42 1061 E T5c, T5d, T53, T56, T57, T58, T5e, T59;
Chris@42 1062 {
Chris@42 1063 E T5a, T5b, T54, T55;
Chris@42 1064 T5a = T2M + T2H;
Chris@42 1065 T5b = T2S - T2X;
Chris@42 1066 T5c = FNMS(KP293892626, T5b, KP475528258 * T5a);
Chris@42 1067 T5d = FMA(KP475528258, T5b, KP293892626 * T5a);
Chris@42 1068 T53 = T4G - T4H;
Chris@42 1069 T54 = T4V + T4U;
Chris@42 1070 T55 = T4X + T4Y;
Chris@42 1071 T56 = T54 + T55;
Chris@42 1072 T57 = FNMS(KP125000000, T56, KP500000000 * T53);
Chris@42 1073 T58 = KP279508497 * (T54 - T55);
Chris@42 1074 }
Chris@42 1075 Rm[WS(rs, 9)] = KP500000000 * (T53 + T56);
Chris@42 1076 T5e = T58 + T57;
Chris@42 1077 Rp[WS(rs, 6)] = T5d + T5e;
Chris@42 1078 Rm[WS(rs, 5)] = T5e - T5d;
Chris@42 1079 T59 = T57 - T58;
Chris@42 1080 Rp[WS(rs, 2)] = T59 - T5c;
Chris@42 1081 Rm[WS(rs, 1)] = T5c + T59;
Chris@42 1082 }
Chris@42 1083 {
Chris@42 1084 E T4A, T4C, T35, T3c, T4j, T4k, T4B, T4l;
Chris@42 1085 {
Chris@42 1086 E T4s, T4z, T38, T3b;
Chris@42 1087 T4s = T4o - T4r;
Chris@42 1088 T4z = T4v - T4y;
Chris@42 1089 T4A = FNMS(KP475528258, T4z, KP293892626 * T4s);
Chris@42 1090 T4C = FMA(KP475528258, T4s, KP293892626 * T4z);
Chris@42 1091 T35 = T33 - T32;
Chris@42 1092 T38 = T36 + T37;
Chris@42 1093 T3b = T39 + T3a;
Chris@42 1094 T3c = T38 + T3b;
Chris@42 1095 T4j = FNMS(KP125000000, T3c, KP500000000 * T35);
Chris@42 1096 T4k = KP279508497 * (T38 - T3b);
Chris@42 1097 }
Chris@42 1098 Ip[0] = KP500000000 * (T35 + T3c);
Chris@42 1099 T4B = T4k + T4j;
Chris@42 1100 Ip[WS(rs, 4)] = T4B + T4C;
Chris@42 1101 Im[WS(rs, 3)] = T4C - T4B;
Chris@42 1102 T4l = T4j - T4k;
Chris@42 1103 Ip[WS(rs, 8)] = T4l + T4A;
Chris@42 1104 Im[WS(rs, 7)] = T4A - T4l;
Chris@42 1105 }
Chris@42 1106 {
Chris@42 1107 E T4O, T4P, T4I, T4J, T4F, T4K, T4Q, T4L;
Chris@42 1108 {
Chris@42 1109 E T4M, T4N, T4D, T4E;
Chris@42 1110 T4M = T36 - T37;
Chris@42 1111 T4N = T39 - T3a;
Chris@42 1112 T4O = FMA(KP475528258, T4M, KP293892626 * T4N);
Chris@42 1113 T4P = FNMS(KP293892626, T4M, KP475528258 * T4N);
Chris@42 1114 T4I = T4G + T4H;
Chris@42 1115 T4D = T4o + T4r;
Chris@42 1116 T4E = T4v + T4y;
Chris@42 1117 T4J = T4D + T4E;
Chris@42 1118 T4F = KP279508497 * (T4D - T4E);
Chris@42 1119 T4K = FNMS(KP125000000, T4J, KP500000000 * T4I);
Chris@42 1120 }
Chris@42 1121 Rp[0] = KP500000000 * (T4I + T4J);
Chris@42 1122 T4Q = T4K - T4F;
Chris@42 1123 Rp[WS(rs, 8)] = T4P + T4Q;
Chris@42 1124 Rm[WS(rs, 7)] = T4Q - T4P;
Chris@42 1125 T4L = T4F + T4K;
Chris@42 1126 Rp[WS(rs, 4)] = T4L - T4O;
Chris@42 1127 Rm[WS(rs, 3)] = T4O + T4L;
Chris@42 1128 }
Chris@42 1129 }
Chris@42 1130 }
Chris@42 1131 }
Chris@42 1132
/*
 * Twiddle-instruction table for this codelet; `desc` below points at it.
 * It holds two entries: a TW_FULL entry carrying the values 1 and 20,
 * followed by a TW_NEXT entry carrying the values 1 and 0.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set of a
 * size-20 transform and TW_NEXT closes the list -- confirm against the
 * tw_instr declaration in the project headers.
 */
Chris@42 1133 static const tw_instr twinstr[] = {
Chris@42 1134 {TW_FULL, 1, 20},
Chris@42 1135 {TW_NEXT, 1, 0}
Chris@42 1136 };
Chris@42 1137
/*
 * Codelet descriptor passed to the registration call below. In order it
 * carries: the number 20, the name string "hc2cfdft_20", the twinstr
 * table, the address of GENUS, and a group of four numbers.
 * NOTE(review): the group {224, 78, 62, 0} presumably records operation
 * counts (additions, multiplications, fused multiply-adds, other) for
 * this variant -- confirm against the hc2c_desc declaration.
 */
Chris@42 1138 static const hc2c_desc desc = { 20, "hc2cfdft_20", twinstr, &GENUS, {224, 78, 62, 0} };
Chris@42 1139
/*
 * Registration entry point for this codelet. Hands the hc2cfdft_20
 * routine, the address of its descriptor `desc`, and the HC2C_VIA_DFT
 * constant to X(khc2c_register) on behalf of planner `p`. Returns nothing.
 */
Chris@42 1140 void X(codelet_hc2cfdft_20) (planner *p) {
Chris@42 1141 X(khc2c_register) (p, hc2cfdft_20, &desc, HC2C_VIA_DFT);
Chris@42 1142 }
Chris@42 1143 #endif /* HAVE_FMA */