annotate src/fftw-3.3.5/rdft/scalar/r2cf/hc2cf_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:48:23 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cf_20 -include hc2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 246 FP additions, 148 FP multiplications,
Chris@42 32 * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
Chris@42 33 * 97 stack variables, 4 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cf.h"
Chris@42 36
/*
 * hc2cf_20, HAVE_FMA variant.
 *
 * Machine-generated codelet (see the "Generated by" comment above: genfft
 * gen_hc2c with -fma, -n 20, -dit).  Regenerate rather than hand-edit.
 *
 * For each m in [mb, me) the body:
 *   - loads the 40 values Rp/Ip/Rm/Im[WS(rs, k)], k = 0..9;
 *   - multiplies every input pair except (Rp[0], Rm[0]) by a twiddle pair
 *     taken from W[0..37];
 *   - combines the results with the four constants below and stores 40
 *     outputs back to Rp/Ip/Rm/Im[WS(rs, k)], k = 0..9.
 * All loads appear before the first store in source order, and every one of
 * the 40 locations is read once and written once.
 *
 * Parameters:
 *   Rp, Ip  - advanced by +ms after every iteration.
 *   Rm, Im  - moved by -ms after every iteration.
 *   W       - twiddle table; offset by (mb - 1) * 38 on entry to the loop and
 *             advanced by 38 per iteration.
 *   rs      - stride argument handed to WS() for the k = 0..9 offsets.
 *   mb, me  - half-open iteration range.
 *   ms      - per-iteration pointer step.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS, FMS and
 * MAKE_VOLATILE_STRIDE are declared outside this file (presumably in the FFTW
 * codelet headers); FMA/FNMS/FMS are assumed to be the usual fused
 * multiply-add / negated-multiply-subtract / multiply-subtract forms --
 * confirm against those headers.
 */
Chris@42 37 static void hc2cf_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Constants: numerically sin(2*pi/5), sqrt(5)/4, 1/4 and (sqrt(5)-1)/2. */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 {
Chris@42 44 INT m;
/* One pass per m; the data pointers and W are stepped in the loop header. */
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
/* Declared at loop scope because they are produced inside the nested block
 * below and consumed by the final five stores at the end of the loop body. */
Chris@42 46 E T4P, T4Y, T50, T4U, T4S, T4T, T4Z, T4V;
Chris@42 47 {
Chris@42 48 E T4N, T4r, T8, T2i, T4n, T2n, T4O, Tl, T2v, T3v, T3T, T4f, TN, T2b, T3F;
Chris@42 49 E T3p, T2R, T3z, T43, T4b, T27, T2f, T3J, T33, T2K, T3y, T40, T4c, T1G, T2e;
Chris@42 50 E T3I, T3a, T2C, T3w, T3W, T4e, T1e, T2c, T3G, T3i;
Chris@42 51 {
/* Input group 1: Rp/Rm at 0 and 5, Ip/Im at 2 and 7, then (nested below)
 * Rp/Rm at 2 and 7 and Ip/Im at 9 and 4.  Rp[0]/Rm[0] are used untwiddled. */
Chris@42 52 E T1, T4q, T3, T6, T2, T5;
Chris@42 53 T1 = Rp[0];
Chris@42 54 T4q = Rm[0];
Chris@42 55 T3 = Rp[WS(rs, 5)];
Chris@42 56 T6 = Rm[WS(rs, 5)];
Chris@42 57 T2 = W[18];
Chris@42 58 T5 = W[19];
Chris@42 59 {
Chris@42 60 E Ta, Td, Tg, T2j, Tb, Tj, Tf, Tc, Ti;
Chris@42 61 {
Chris@42 62 E T4o, T4, T9, T4p, T7;
Chris@42 63 Ta = Ip[WS(rs, 2)];
Chris@42 64 Td = Im[WS(rs, 2)];
Chris@42 65 T4o = T2 * T6;
Chris@42 66 T4 = T2 * T3;
Chris@42 67 T9 = W[8];
Chris@42 68 Tg = Ip[WS(rs, 7)];
Chris@42 69 T4p = FNMS(T5, T3, T4o);
Chris@42 70 T7 = FMA(T5, T6, T4);
Chris@42 71 T2j = T9 * Td;
Chris@42 72 Tb = T9 * Ta;
Chris@42 73 T4N = T4q - T4p;
Chris@42 74 T4r = T4p + T4q;
Chris@42 75 T8 = T1 + T7;
Chris@42 76 T2i = T1 - T7;
Chris@42 77 Tj = Im[WS(rs, 7)];
Chris@42 78 Tf = W[28];
Chris@42 79 }
Chris@42 80 Tc = W[9];
Chris@42 81 Ti = W[29];
Chris@42 82 {
Chris@42 83 E T3l, Ts, T2t, TL, TB, TE, TD, T3n, Ty, T2q, TC;
Chris@42 84 {
Chris@42 85 E TH, TK, TJ, T2s, TI;
Chris@42 86 {
Chris@42 87 E To, Tr, Tp, T3k, Tq, TG;
Chris@42 88 {
Chris@42 89 E T2k, Te, T2m, Tk, T2l, Th, Tn;
Chris@42 90 To = Rp[WS(rs, 2)];
Chris@42 91 T2l = Tf * Tj;
Chris@42 92 Th = Tf * Tg;
Chris@42 93 T2k = FNMS(Tc, Ta, T2j);
Chris@42 94 Te = FMA(Tc, Td, Tb);
Chris@42 95 T2m = FNMS(Ti, Tg, T2l);
Chris@42 96 Tk = FMA(Ti, Tj, Th);
Chris@42 97 Tr = Rm[WS(rs, 2)];
Chris@42 98 Tn = W[6];
Chris@42 99 T4n = T2k + T2m;
Chris@42 100 T2n = T2k - T2m;
Chris@42 101 T4O = Te - Tk;
Chris@42 102 Tl = Te + Tk;
Chris@42 103 Tp = Tn * To;
Chris@42 104 T3k = Tn * Tr;
Chris@42 105 }
Chris@42 106 Tq = W[7];
Chris@42 107 TH = Ip[WS(rs, 9)];
Chris@42 108 TK = Im[WS(rs, 9)];
Chris@42 109 TG = W[36];
Chris@42 110 T3l = FNMS(Tq, To, T3k);
Chris@42 111 Ts = FMA(Tq, Tr, Tp);
Chris@42 112 TJ = W[37];
Chris@42 113 T2s = TG * TK;
Chris@42 114 TI = TG * TH;
Chris@42 115 }
Chris@42 116 {
Chris@42 117 E Tu, Tx, Tt, Tw, T3m, Tv, TA;
Chris@42 118 Tu = Rp[WS(rs, 7)];
Chris@42 119 Tx = Rm[WS(rs, 7)];
Chris@42 120 T2t = FNMS(TJ, TH, T2s);
Chris@42 121 TL = FMA(TJ, TK, TI);
Chris@42 122 Tt = W[26];
Chris@42 123 Tw = W[27];
Chris@42 124 TB = Ip[WS(rs, 4)];
Chris@42 125 TE = Im[WS(rs, 4)];
Chris@42 126 T3m = Tt * Tx;
Chris@42 127 Tv = Tt * Tu;
Chris@42 128 TA = W[16];
Chris@42 129 TD = W[17];
Chris@42 130 T3n = FNMS(Tw, Tu, T3m);
Chris@42 131 Ty = FMA(Tw, Tx, Tv);
Chris@42 132 T2q = TA * TE;
Chris@42 133 TC = TA * TB;
Chris@42 134 }
Chris@42 135 }
Chris@42 136 {
Chris@42 137 E T3o, T3R, Tz, T2p, T2r, TF;
Chris@42 138 T3o = T3l - T3n;
Chris@42 139 T3R = T3l + T3n;
Chris@42 140 Tz = Ts + Ty;
Chris@42 141 T2p = Ts - Ty;
Chris@42 142 T2r = FNMS(TD, TB, T2q);
Chris@42 143 TF = FMA(TD, TE, TC);
Chris@42 144 {
Chris@42 145 E T3S, T2u, TM, T3j;
Chris@42 146 T3S = T2r + T2t;
Chris@42 147 T2u = T2r - T2t;
Chris@42 148 TM = TF + TL;
Chris@42 149 T3j = TL - TF;
Chris@42 150 T2v = T2p - T2u;
Chris@42 151 T3v = T2p + T2u;
Chris@42 152 T3T = T3R + T3S;
Chris@42 153 T4f = T3S - T3R;
Chris@42 154 TN = Tz - TM;
Chris@42 155 T2b = Tz + TM;
Chris@42 156 T3F = T3o + T3j;
Chris@42 157 T3p = T3j - T3o;
Chris@42 158 }
Chris@42 159 }
Chris@42 160 }
Chris@42 161 }
Chris@42 162 }
/* Input group 2: Rp/Rm at 6 and 1, Ip/Im at 3 and 8. */
Chris@42 163 {
Chris@42 164 E T2Z, T1M, T2P, T25, T1V, T1Y, T1X, T31, T1S, T2M, T1W;
Chris@42 165 {
Chris@42 166 E T21, T24, T23, T2O, T22;
Chris@42 167 {
Chris@42 168 E T1I, T1L, T1H, T1K, T2Y, T1J, T20;
Chris@42 169 T1I = Rp[WS(rs, 6)];
Chris@42 170 T1L = Rm[WS(rs, 6)];
Chris@42 171 T1H = W[22];
Chris@42 172 T1K = W[23];
Chris@42 173 T21 = Ip[WS(rs, 3)];
Chris@42 174 T24 = Im[WS(rs, 3)];
Chris@42 175 T2Y = T1H * T1L;
Chris@42 176 T1J = T1H * T1I;
Chris@42 177 T20 = W[12];
Chris@42 178 T23 = W[13];
Chris@42 179 T2Z = FNMS(T1K, T1I, T2Y);
Chris@42 180 T1M = FMA(T1K, T1L, T1J);
Chris@42 181 T2O = T20 * T24;
Chris@42 182 T22 = T20 * T21;
Chris@42 183 }
Chris@42 184 {
Chris@42 185 E T1O, T1R, T1N, T1Q, T30, T1P, T1U;
Chris@42 186 T1O = Rp[WS(rs, 1)];
Chris@42 187 T1R = Rm[WS(rs, 1)];
Chris@42 188 T2P = FNMS(T23, T21, T2O);
Chris@42 189 T25 = FMA(T23, T24, T22);
Chris@42 190 T1N = W[2];
Chris@42 191 T1Q = W[3];
Chris@42 192 T1V = Ip[WS(rs, 8)];
Chris@42 193 T1Y = Im[WS(rs, 8)];
Chris@42 194 T30 = T1N * T1R;
Chris@42 195 T1P = T1N * T1O;
Chris@42 196 T1U = W[32];
Chris@42 197 T1X = W[33];
Chris@42 198 T31 = FNMS(T1Q, T1O, T30);
Chris@42 199 T1S = FMA(T1Q, T1R, T1P);
Chris@42 200 T2M = T1U * T1Y;
Chris@42 201 T1W = T1U * T1V;
Chris@42 202 }
Chris@42 203 }
Chris@42 204 {
Chris@42 205 E T32, T41, T1T, T2L, T2N, T1Z;
Chris@42 206 T32 = T2Z - T31;
Chris@42 207 T41 = T2Z + T31;
Chris@42 208 T1T = T1M + T1S;
Chris@42 209 T2L = T1M - T1S;
Chris@42 210 T2N = FNMS(T1X, T1V, T2M);
Chris@42 211 T1Z = FMA(T1X, T1Y, T1W);
Chris@42 212 {
Chris@42 213 E T42, T2Q, T26, T2X;
Chris@42 214 T42 = T2N + T2P;
Chris@42 215 T2Q = T2N - T2P;
Chris@42 216 T26 = T1Z + T25;
Chris@42 217 T2X = T25 - T1Z;
Chris@42 218 T2R = T2L - T2Q;
Chris@42 219 T3z = T2L + T2Q;
Chris@42 220 T43 = T41 + T42;
Chris@42 221 T4b = T42 - T41;
Chris@42 222 T27 = T1T - T26;
Chris@42 223 T2f = T1T + T26;
Chris@42 224 T3J = T32 + T2X;
Chris@42 225 T33 = T2X - T32;
Chris@42 226 }
Chris@42 227 }
Chris@42 228 }
/* Input group 3: Rp/Rm at 4 and 9, Ip/Im at 1 and 6. */
Chris@42 229 {
Chris@42 230 E T36, T1l, T2I, T1E, T1u, T1x, T1w, T38, T1r, T2F, T1v;
Chris@42 231 {
Chris@42 232 E T1A, T1D, T1C, T2H, T1B;
Chris@42 233 {
Chris@42 234 E T1h, T1k, T1g, T1j, T35, T1i, T1z;
Chris@42 235 T1h = Rp[WS(rs, 4)];
Chris@42 236 T1k = Rm[WS(rs, 4)];
Chris@42 237 T1g = W[14];
Chris@42 238 T1j = W[15];
Chris@42 239 T1A = Ip[WS(rs, 1)];
Chris@42 240 T1D = Im[WS(rs, 1)];
Chris@42 241 T35 = T1g * T1k;
Chris@42 242 T1i = T1g * T1h;
Chris@42 243 T1z = W[4];
Chris@42 244 T1C = W[5];
Chris@42 245 T36 = FNMS(T1j, T1h, T35);
Chris@42 246 T1l = FMA(T1j, T1k, T1i);
Chris@42 247 T2H = T1z * T1D;
Chris@42 248 T1B = T1z * T1A;
Chris@42 249 }
Chris@42 250 {
Chris@42 251 E T1n, T1q, T1m, T1p, T37, T1o, T1t;
Chris@42 252 T1n = Rp[WS(rs, 9)];
Chris@42 253 T1q = Rm[WS(rs, 9)];
Chris@42 254 T2I = FNMS(T1C, T1A, T2H);
Chris@42 255 T1E = FMA(T1C, T1D, T1B);
Chris@42 256 T1m = W[34];
Chris@42 257 T1p = W[35];
Chris@42 258 T1u = Ip[WS(rs, 6)];
Chris@42 259 T1x = Im[WS(rs, 6)];
Chris@42 260 T37 = T1m * T1q;
Chris@42 261 T1o = T1m * T1n;
Chris@42 262 T1t = W[24];
Chris@42 263 T1w = W[25];
Chris@42 264 T38 = FNMS(T1p, T1n, T37);
Chris@42 265 T1r = FMA(T1p, T1q, T1o);
Chris@42 266 T2F = T1t * T1x;
Chris@42 267 T1v = T1t * T1u;
Chris@42 268 }
Chris@42 269 }
Chris@42 270 {
Chris@42 271 E T39, T3Y, T1s, T2E, T2G, T1y;
Chris@42 272 T39 = T36 - T38;
Chris@42 273 T3Y = T36 + T38;
Chris@42 274 T1s = T1l + T1r;
Chris@42 275 T2E = T1l - T1r;
Chris@42 276 T2G = FNMS(T1w, T1u, T2F);
Chris@42 277 T1y = FMA(T1w, T1x, T1v);
Chris@42 278 {
Chris@42 279 E T3Z, T2J, T1F, T34;
Chris@42 280 T3Z = T2G + T2I;
Chris@42 281 T2J = T2G - T2I;
Chris@42 282 T1F = T1y + T1E;
Chris@42 283 T34 = T1E - T1y;
Chris@42 284 T2K = T2E - T2J;
Chris@42 285 T3y = T2E + T2J;
Chris@42 286 T40 = T3Y + T3Z;
Chris@42 287 T4c = T3Z - T3Y;
Chris@42 288 T1G = T1s - T1F;
Chris@42 289 T2e = T1s + T1F;
Chris@42 290 T3I = T39 + T34;
Chris@42 291 T3a = T34 - T39;
Chris@42 292 }
Chris@42 293 }
Chris@42 294 }
/* Input group 4: Rp/Rm at 8 and 3, Ip/Im at 5 and 0. */
Chris@42 295 {
Chris@42 296 E T3e, TT, T2A, T1c, T12, T15, T14, T3g, TZ, T2x, T13;
Chris@42 297 {
Chris@42 298 E T18, T1b, T1a, T2z, T19;
Chris@42 299 {
Chris@42 300 E TP, TS, TO, TR, T3d, TQ, T17;
Chris@42 301 TP = Rp[WS(rs, 8)];
Chris@42 302 TS = Rm[WS(rs, 8)];
Chris@42 303 TO = W[30];
Chris@42 304 TR = W[31];
Chris@42 305 T18 = Ip[WS(rs, 5)];
Chris@42 306 T1b = Im[WS(rs, 5)];
Chris@42 307 T3d = TO * TS;
Chris@42 308 TQ = TO * TP;
Chris@42 309 T17 = W[20];
Chris@42 310 T1a = W[21];
Chris@42 311 T3e = FNMS(TR, TP, T3d);
Chris@42 312 TT = FMA(TR, TS, TQ);
Chris@42 313 T2z = T17 * T1b;
Chris@42 314 T19 = T17 * T18;
Chris@42 315 }
Chris@42 316 {
Chris@42 317 E TV, TY, TU, TX, T3f, TW, T11;
Chris@42 318 TV = Rp[WS(rs, 3)];
Chris@42 319 TY = Rm[WS(rs, 3)];
Chris@42 320 T2A = FNMS(T1a, T18, T2z);
Chris@42 321 T1c = FMA(T1a, T1b, T19);
Chris@42 322 TU = W[10];
Chris@42 323 TX = W[11];
Chris@42 324 T12 = Ip[0];
Chris@42 325 T15 = Im[0];
Chris@42 326 T3f = TU * TY;
Chris@42 327 TW = TU * TV;
Chris@42 328 T11 = W[0];
Chris@42 329 T14 = W[1];
Chris@42 330 T3g = FNMS(TX, TV, T3f);
Chris@42 331 TZ = FMA(TX, TY, TW);
Chris@42 332 T2x = T11 * T15;
Chris@42 333 T13 = T11 * T12;
Chris@42 334 }
Chris@42 335 }
Chris@42 336 {
Chris@42 337 E T3h, T3U, T10, T2w, T2y, T16;
Chris@42 338 T3h = T3e - T3g;
Chris@42 339 T3U = T3e + T3g;
Chris@42 340 T10 = TT + TZ;
Chris@42 341 T2w = TT - TZ;
Chris@42 342 T2y = FNMS(T14, T12, T2x);
Chris@42 343 T16 = FMA(T14, T15, T13);
Chris@42 344 {
Chris@42 345 E T3V, T2B, T1d, T3c;
Chris@42 346 T3V = T2y + T2A;
Chris@42 347 T2B = T2y - T2A;
Chris@42 348 T1d = T16 + T1c;
Chris@42 349 T3c = T1c - T16;
Chris@42 350 T2C = T2w - T2B;
Chris@42 351 T3w = T2w + T2B;
Chris@42 352 T3W = T3U + T3V;
Chris@42 353 T4e = T3V - T3U;
Chris@42 354 T1e = T10 - T1d;
Chris@42 355 T2c = T10 + T1d;
Chris@42 356 T3G = T3h + T3c;
Chris@42 357 T3i = T3c - T3h;
Chris@42 358 }
Chris@42 359 }
Chris@42 360 }
/* Output stage: all inputs have been loaded; from here on the combined
 * values are merged with the KP* constants and stored, interleaved with the
 * remaining arithmetic. */
Chris@42 361 {
Chris@42 362 E T4s, T4k, T4l, T45, T47, T3P, T4y, T4A, T3O;
Chris@42 363 {
Chris@42 364 E T4C, T4B, T2a, T4j, T4h, T4E, T4M, T4K, T4i, T4a;
Chris@42 365 {
Chris@42 366 E Tm, T1f, T4J, T4I, T28, T4d, T4g, T29, T49, T48;
Chris@42 367 T4C = T4c + T4b;
Chris@42 368 T4d = T4b - T4c;
Chris@42 369 T4g = T4e - T4f;
Chris@42 370 T4B = T4f + T4e;
Chris@42 371 T2a = T8 + Tl;
Chris@42 372 Tm = T8 - Tl;
Chris@42 373 T1f = TN + T1e;
Chris@42 374 T4J = T1e - TN;
Chris@42 375 T4I = T1G - T27;
Chris@42 376 T28 = T1G + T27;
Chris@42 377 T4j = FMA(KP618033988, T4d, T4g);
Chris@42 378 T4h = FNMS(KP618033988, T4g, T4d);
Chris@42 379 T29 = T1f + T28;
Chris@42 380 T49 = T1f - T28;
Chris@42 381 T4E = T4r - T4n;
Chris@42 382 T4s = T4n + T4r;
Chris@42 383 Rm[WS(rs, 9)] = Tm + T29;
Chris@42 384 T48 = FNMS(KP250000000, T29, Tm);
Chris@42 385 T4M = FNMS(KP618033988, T4I, T4J);
Chris@42 386 T4K = FMA(KP618033988, T4J, T4I);
Chris@42 387 T4i = FMA(KP559016994, T49, T48);
Chris@42 388 T4a = FNMS(KP559016994, T49, T48);
Chris@42 389 }
Chris@42 390 {
Chris@42 391 E T2d, T4w, T4x, T2g, T2h;
Chris@42 392 {
Chris@42 393 E T3X, T4G, T4F, T44, T4D, T4L, T4H;
Chris@42 394 T4k = T3T + T3W;
Chris@42 395 T3X = T3T - T3W;
Chris@42 396 T4G = T4C - T4B;
Chris@42 397 T4D = T4B + T4C;
Chris@42 398 Rm[WS(rs, 1)] = FMA(KP951056516, T4h, T4a);
Chris@42 399 Rp[WS(rs, 2)] = FNMS(KP951056516, T4h, T4a);
Chris@42 400 Rp[WS(rs, 6)] = FMA(KP951056516, T4j, T4i);
Chris@42 401 Rm[WS(rs, 5)] = FNMS(KP951056516, T4j, T4i);
Chris@42 402 Im[WS(rs, 9)] = T4D - T4E;
Chris@42 403 T4F = FMA(KP250000000, T4D, T4E);
Chris@42 404 T44 = T40 - T43;
Chris@42 405 T4l = T40 + T43;
Chris@42 406 T2d = T2b + T2c;
Chris@42 407 T4w = T2b - T2c;
Chris@42 408 T4L = FMA(KP559016994, T4G, T4F);
Chris@42 409 T4H = FNMS(KP559016994, T4G, T4F);
Chris@42 410 T45 = FMA(KP618033988, T44, T3X);
Chris@42 411 T47 = FNMS(KP618033988, T3X, T44);
Chris@42 412 Ip[WS(rs, 2)] = FMA(KP951056516, T4K, T4H);
Chris@42 413 Im[WS(rs, 1)] = FMS(KP951056516, T4K, T4H);
Chris@42 414 Ip[WS(rs, 6)] = FMA(KP951056516, T4M, T4L);
Chris@42 415 Im[WS(rs, 5)] = FMS(KP951056516, T4M, T4L);
Chris@42 416 T4x = T2f - T2e;
Chris@42 417 T2g = T2e + T2f;
Chris@42 418 }
Chris@42 419 T2h = T2d + T2g;
Chris@42 420 T3P = T2d - T2g;
Chris@42 421 T4y = FNMS(KP618033988, T4x, T4w);
Chris@42 422 T4A = FMA(KP618033988, T4w, T4x);
Chris@42 423 Rp[0] = T2a + T2h;
Chris@42 424 T3O = FNMS(KP250000000, T2h, T2a);
Chris@42 425 }
Chris@42 426 }
Chris@42 427 {
Chris@42 428 E T3u, T54, T5a, T5c, T56, T53;
Chris@42 429 {
Chris@42 430 E T52, T51, T3t, T3r, T2o, T58, T59, T2T, T2V, T4u, T4t, T2U, T3s, T2W;
Chris@42 431 {
Chris@42 432 E T3b, T3q, T46, T3Q, T4m;
Chris@42 433 T52 = T3a + T33;
Chris@42 434 T3b = T33 - T3a;
Chris@42 435 T3q = T3i - T3p;
Chris@42 436 T51 = T3p + T3i;
Chris@42 437 T46 = FNMS(KP559016994, T3P, T3O);
Chris@42 438 T3Q = FMA(KP559016994, T3P, T3O);
Chris@42 439 T4m = T4k + T4l;
Chris@42 440 T4u = T4k - T4l;
Chris@42 441 Rm[WS(rs, 3)] = FMA(KP951056516, T45, T3Q);
Chris@42 442 Rp[WS(rs, 4)] = FNMS(KP951056516, T45, T3Q);
Chris@42 443 Rp[WS(rs, 8)] = FMA(KP951056516, T47, T46);
Chris@42 444 Rm[WS(rs, 7)] = FNMS(KP951056516, T47, T46);
Chris@42 445 Ip[0] = T4m + T4s;
Chris@42 446 T4t = FNMS(KP250000000, T4m, T4s);
Chris@42 447 T3t = FMA(KP618033988, T3b, T3q);
Chris@42 448 T3r = FNMS(KP618033988, T3q, T3b);
Chris@42 449 }
Chris@42 450 T3u = T2i + T2n;
Chris@42 451 T2o = T2i - T2n;
Chris@42 452 {
Chris@42 453 E T4v, T4z, T2D, T2S;
Chris@42 454 T4v = FMA(KP559016994, T4u, T4t);
Chris@42 455 T4z = FNMS(KP559016994, T4u, T4t);
Chris@42 456 T2D = T2v + T2C;
Chris@42 457 T58 = T2v - T2C;
Chris@42 458 T59 = T2K - T2R;
Chris@42 459 T2S = T2K + T2R;
Chris@42 460 Ip[WS(rs, 4)] = FMA(KP951056516, T4y, T4v);
Chris@42 461 Im[WS(rs, 3)] = FMS(KP951056516, T4y, T4v);
Chris@42 462 Ip[WS(rs, 8)] = FMA(KP951056516, T4A, T4z);
Chris@42 463 Im[WS(rs, 7)] = FMS(KP951056516, T4A, T4z);
Chris@42 464 T2T = T2D + T2S;
Chris@42 465 T2V = T2D - T2S;
Chris@42 466 }
Chris@42 467 Rm[WS(rs, 4)] = T2o + T2T;
Chris@42 468 T2U = FNMS(KP250000000, T2T, T2o);
Chris@42 469 T54 = T4O + T4N;
Chris@42 470 T4P = T4N - T4O;
Chris@42 471 T5a = FMA(KP618033988, T59, T58);
Chris@42 472 T5c = FNMS(KP618033988, T58, T59);
Chris@42 473 T3s = FMA(KP559016994, T2V, T2U);
Chris@42 474 T2W = FNMS(KP559016994, T2V, T2U);
Chris@42 475 Rp[WS(rs, 7)] = FNMS(KP951056516, T3r, T2W);
Chris@42 476 Rp[WS(rs, 3)] = FMA(KP951056516, T3r, T2W);
Chris@42 477 Rm[0] = FNMS(KP951056516, T3t, T3s);
Chris@42 478 Rm[WS(rs, 8)] = FMA(KP951056516, T3t, T3s);
Chris@42 479 T56 = T51 - T52;
Chris@42 480 T53 = T51 + T52;
Chris@42 481 }
Chris@42 482 {
Chris@42 483 E T4Q, T4R, T3N, T3L, T4X, T4W, T3B, T3D, T3H, T3K, T55, T3C, T3M, T3E;
Chris@42 484 T4Q = T3F + T3G;
Chris@42 485 T3H = T3F - T3G;
Chris@42 486 T3K = T3I - T3J;
Chris@42 487 T4R = T3I + T3J;
Chris@42 488 Im[WS(rs, 4)] = T53 - T54;
Chris@42 489 T55 = FMA(KP250000000, T53, T54);
Chris@42 490 T3N = FNMS(KP618033988, T3H, T3K);
Chris@42 491 T3L = FMA(KP618033988, T3K, T3H);
Chris@42 492 {
Chris@42 493 E T57, T5b, T3x, T3A;
Chris@42 494 T57 = FNMS(KP559016994, T56, T55);
Chris@42 495 T5b = FMA(KP559016994, T56, T55);
Chris@42 496 T3x = T3v + T3w;
Chris@42 497 T4X = T3v - T3w;
Chris@42 498 T4W = T3y - T3z;
Chris@42 499 T3A = T3y + T3z;
Chris@42 500 Im[0] = -(FMA(KP951056516, T5a, T57));
Chris@42 501 Im[WS(rs, 8)] = FMS(KP951056516, T5a, T57);
Chris@42 502 Ip[WS(rs, 7)] = FMA(KP951056516, T5c, T5b);
Chris@42 503 Ip[WS(rs, 3)] = FNMS(KP951056516, T5c, T5b);
Chris@42 504 T3B = T3x + T3A;
Chris@42 505 T3D = T3x - T3A;
Chris@42 506 }
Chris@42 507 Rp[WS(rs, 5)] = T3u + T3B;
Chris@42 508 T3C = FNMS(KP250000000, T3B, T3u);
Chris@42 509 T4Y = FNMS(KP618033988, T4X, T4W);
Chris@42 510 T50 = FMA(KP618033988, T4W, T4X);
Chris@42 511 T3M = FNMS(KP559016994, T3D, T3C);
Chris@42 512 T3E = FMA(KP559016994, T3D, T3C);
Chris@42 513 Rp[WS(rs, 9)] = FNMS(KP951056516, T3L, T3E);
Chris@42 514 Rp[WS(rs, 1)] = FMA(KP951056516, T3L, T3E);
Chris@42 515 Rm[WS(rs, 2)] = FNMS(KP951056516, T3N, T3M);
Chris@42 516 Rm[WS(rs, 6)] = FMA(KP951056516, T3N, T3M);
Chris@42 517 T4U = T4Q - T4R;
Chris@42 518 T4S = T4Q + T4R;
Chris@42 519 }
Chris@42 520 }
Chris@42 521 }
Chris@42 522 }
/* Final five stores (Ip at 5, 9, 1 and Im at 2, 6), built from the
 * loop-scope temporaries T4P, T4S, T4U, T4Y and T50 set above. */
Chris@42 523 Ip[WS(rs, 5)] = T4S + T4P;
Chris@42 524 T4T = FNMS(KP250000000, T4S, T4P);
Chris@42 525 T4Z = FMA(KP559016994, T4U, T4T);
Chris@42 526 T4V = FNMS(KP559016994, T4U, T4T);
Chris@42 527 Im[WS(rs, 2)] = -(FMA(KP951056516, T4Y, T4V));
Chris@42 528 Im[WS(rs, 6)] = FMS(KP951056516, T4Y, T4V);
Chris@42 529 Ip[WS(rs, 9)] = FMA(KP951056516, T50, T4Z);
Chris@42 530 Ip[WS(rs, 1)] = FNMS(KP951056516, T50, T4Z);
Chris@42 531 }
Chris@42 532 }
Chris@42 533 }
Chris@42 534
/*
 * Twiddle-instruction table referenced by desc below (HAVE_FMA build).
 * Two entries: {TW_FULL, 1, 20} followed by {TW_NEXT, 1, 0}.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined outside this file;
 * presumably TW_FULL requests the full twiddle set for size 20 and TW_NEXT
 * ends the list -- confirm against the FFTW headers.
 */
Chris@42 535 static const tw_instr twinstr[] = {
Chris@42 536 {TW_FULL, 1, 20},
Chris@42 537 {TW_NEXT, 1, 0}
Chris@42 538 };
Chris@42 539
/*
 * Codelet descriptor for the HAVE_FMA build: size 20, name "hc2cf_20", the
 * twinstr table above, &GENUS, and the operation counts {136, 38, 110, 0}.
 * The first three counts match the header comment of this variant (136
 * additions, 38 multiplications, 110 fused multiply/add).
 * NOTE(review): hc2c_desc and GENUS are defined outside this file; the
 * meaning of the fourth count (0) is not visible here.
 */
Chris@42 540 static const hc2c_desc desc = { 20, "hc2cf_20", twinstr, &GENUS, {136, 38, 110, 0} };
Chris@42 541
/*
 * Registration entry point (HAVE_FMA build): hands hc2cf_20 and its
 * descriptor to the planner p through X(khc2c_register), tagged with
 * HC2C_VIA_RDFT.
 * NOTE(review): X(), planner, khc2c_register and HC2C_VIA_RDFT come from
 * outside this file; X() is presumably the FFTW name-mangling macro.
 */
Chris@42 542 void X(codelet_hc2cf_20) (planner *p) {
Chris@42 543 X(khc2c_register) (p, hc2cf_20, &desc, HC2C_VIA_RDFT);
Chris@42 544 }
Chris@42 545 #else /* HAVE_FMA */
Chris@42 546
Chris@42 547 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cf_20 -include hc2cf.h */
Chris@42 548
Chris@42 549 /*
Chris@42 550 * This function contains 246 FP additions, 124 FP multiplications,
Chris@42 551 * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
Chris@42 552 * 85 stack variables, 4 constants, and 80 memory accesses
Chris@42 553 */
Chris@42 554 #include "hc2cf.h"
Chris@42 555
/*
 * hc2cf_20, non-FMA variant (compiled when HAVE_FMA is not defined).
 *
 * Machine-generated codelet (see the "Generated by" comment above: genfft
 * gen_hc2c with -n 20, -dit).  Regenerate rather than hand-edit.
 *
 * For each m in [mb, me) the body:
 *   - loads the 40 values Rp/Ip/Rm/Im[WS(rs, k)], k = 0..9;
 *   - multiplies every input pair except (Rp[0], Rm[0]) by a twiddle pair
 *     taken from W[0..37];
 *   - forms sums and differences, then stores 40 outputs back to
 *     Rp/Ip/Rm/Im[WS(rs, k)], k = 0..9, in eight groups of five stores.
 * All loads appear before the first store in source order, and every one of
 * the 40 locations is read once and written once.
 *
 * Parameters:
 *   Rp, Ip  - advanced by +ms after every iteration.
 *   Rm, Im  - moved by -ms after every iteration.
 *   W       - twiddle table; offset by (mb - 1) * 38 on entry to the loop and
 *             advanced by 38 per iteration.
 *   rs      - stride argument handed to WS() for the k = 0..9 offsets.
 *   mb, me  - half-open iteration range.
 *   ms      - per-iteration pointer step.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are declared outside this file (presumably in the FFTW
 * codelet headers); FMA/FNMS are assumed to expand to a*b+c / c-a*b here --
 * confirm against those headers.
 */
Chris@42 556 static void hc2cf_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 557 {
/* Constants: numerically sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
Chris@42 558 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 559 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 560 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 561 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 562 {
Chris@42 563 INT m;
/* One pass per m; the data pointers and W are stepped in the loop header. */
Chris@42 564 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
/* Intermediates shared between the three input groups and the output groups. */
Chris@42 565 E Tj, T1R, T4j, T4s, T2q, T37, T3Q, T42, T1r, T1O, T1P, T3p, T3s, T3K, T3A;
Chris@42 566 E T3B, T3Z, T1V, T1W, T1X, T23, T28, T4q, T2W, T2X, T4f, T33, T34, T35, T2G;
Chris@42 567 E T2L, T2M, TG, T13, T14, T3i, T3l, T3J, T3D, T3E, T40, T1S, T1T, T1U, T2e;
Chris@42 568 E T2j, T4p, T2T, T2U, T4e, T30, T31, T32, T2v, T2A, T2B;
/* Input group 1: Rp/Rm at 0 (untwiddled) and 5, Ip/Im at 2 and 7. */
Chris@42 569 {
Chris@42 570 E T1, T3O, T6, T3N, Tc, T2n, Th, T2o;
Chris@42 571 T1 = Rp[0];
Chris@42 572 T3O = Rm[0];
Chris@42 573 {
Chris@42 574 E T3, T5, T2, T4;
Chris@42 575 T3 = Rp[WS(rs, 5)];
Chris@42 576 T5 = Rm[WS(rs, 5)];
Chris@42 577 T2 = W[18];
Chris@42 578 T4 = W[19];
Chris@42 579 T6 = FMA(T2, T3, T4 * T5);
Chris@42 580 T3N = FNMS(T4, T3, T2 * T5);
Chris@42 581 }
Chris@42 582 {
Chris@42 583 E T9, Tb, T8, Ta;
Chris@42 584 T9 = Ip[WS(rs, 2)];
Chris@42 585 Tb = Im[WS(rs, 2)];
Chris@42 586 T8 = W[8];
Chris@42 587 Ta = W[9];
Chris@42 588 Tc = FMA(T8, T9, Ta * Tb);
Chris@42 589 T2n = FNMS(Ta, T9, T8 * Tb);
Chris@42 590 }
Chris@42 591 {
Chris@42 592 E Te, Tg, Td, Tf;
Chris@42 593 Te = Ip[WS(rs, 7)];
Chris@42 594 Tg = Im[WS(rs, 7)];
Chris@42 595 Td = W[28];
Chris@42 596 Tf = W[29];
Chris@42 597 Th = FMA(Td, Te, Tf * Tg);
Chris@42 598 T2o = FNMS(Tf, Te, Td * Tg);
Chris@42 599 }
Chris@42 600 {
Chris@42 601 E T7, Ti, T4h, T4i;
Chris@42 602 T7 = T1 + T6;
Chris@42 603 Ti = Tc + Th;
Chris@42 604 Tj = T7 - Ti;
Chris@42 605 T1R = T7 + Ti;
Chris@42 606 T4h = T3O - T3N;
Chris@42 607 T4i = Tc - Th;
Chris@42 608 T4j = T4h - T4i;
Chris@42 609 T4s = T4i + T4h;
Chris@42 610 }
Chris@42 611 {
Chris@42 612 E T2m, T2p, T3M, T3P;
Chris@42 613 T2m = T1 - T6;
Chris@42 614 T2p = T2n - T2o;
Chris@42 615 T2q = T2m - T2p;
Chris@42 616 T37 = T2m + T2p;
Chris@42 617 T3M = T2n + T2o;
Chris@42 618 T3P = T3N + T3O;
Chris@42 619 T3Q = T3M + T3P;
Chris@42 620 T42 = T3P - T3M;
Chris@42 621 }
Chris@42 622 }
/* Input group 2: Rp/Rm at 4, 9, 6, 1 and Ip/Im at 8, 3, 6, 1. */
Chris@42 623 {
Chris@42 624 E T1f, T3n, T21, T2C, T1N, T3r, T27, T2K, T1q, T3o, T22, T2F, T1C, T3q, T26;
Chris@42 625 E T2H;
Chris@42 626 {
Chris@42 627 E T19, T1Z, T1e, T20;
Chris@42 628 {
Chris@42 629 E T16, T18, T15, T17;
Chris@42 630 T16 = Rp[WS(rs, 4)];
Chris@42 631 T18 = Rm[WS(rs, 4)];
Chris@42 632 T15 = W[14];
Chris@42 633 T17 = W[15];
Chris@42 634 T19 = FMA(T15, T16, T17 * T18);
Chris@42 635 T1Z = FNMS(T17, T16, T15 * T18);
Chris@42 636 }
Chris@42 637 {
Chris@42 638 E T1b, T1d, T1a, T1c;
Chris@42 639 T1b = Rp[WS(rs, 9)];
Chris@42 640 T1d = Rm[WS(rs, 9)];
Chris@42 641 T1a = W[34];
Chris@42 642 T1c = W[35];
Chris@42 643 T1e = FMA(T1a, T1b, T1c * T1d);
Chris@42 644 T20 = FNMS(T1c, T1b, T1a * T1d);
Chris@42 645 }
Chris@42 646 T1f = T19 + T1e;
Chris@42 647 T3n = T1Z + T20;
Chris@42 648 T21 = T1Z - T20;
Chris@42 649 T2C = T19 - T1e;
Chris@42 650 }
Chris@42 651 {
Chris@42 652 E T1H, T2I, T1M, T2J;
Chris@42 653 {
Chris@42 654 E T1E, T1G, T1D, T1F;
Chris@42 655 T1E = Ip[WS(rs, 8)];
Chris@42 656 T1G = Im[WS(rs, 8)];
Chris@42 657 T1D = W[32];
Chris@42 658 T1F = W[33];
Chris@42 659 T1H = FMA(T1D, T1E, T1F * T1G);
Chris@42 660 T2I = FNMS(T1F, T1E, T1D * T1G);
Chris@42 661 }
Chris@42 662 {
Chris@42 663 E T1J, T1L, T1I, T1K;
Chris@42 664 T1J = Ip[WS(rs, 3)];
Chris@42 665 T1L = Im[WS(rs, 3)];
Chris@42 666 T1I = W[12];
Chris@42 667 T1K = W[13];
Chris@42 668 T1M = FMA(T1I, T1J, T1K * T1L);
Chris@42 669 T2J = FNMS(T1K, T1J, T1I * T1L);
Chris@42 670 }
Chris@42 671 T1N = T1H + T1M;
Chris@42 672 T3r = T2I + T2J;
Chris@42 673 T27 = T1H - T1M;
Chris@42 674 T2K = T2I - T2J;
Chris@42 675 }
Chris@42 676 {
Chris@42 677 E T1k, T2D, T1p, T2E;
Chris@42 678 {
Chris@42 679 E T1h, T1j, T1g, T1i;
Chris@42 680 T1h = Ip[WS(rs, 6)];
Chris@42 681 T1j = Im[WS(rs, 6)];
Chris@42 682 T1g = W[24];
Chris@42 683 T1i = W[25];
Chris@42 684 T1k = FMA(T1g, T1h, T1i * T1j);
Chris@42 685 T2D = FNMS(T1i, T1h, T1g * T1j);
Chris@42 686 }
Chris@42 687 {
Chris@42 688 E T1m, T1o, T1l, T1n;
Chris@42 689 T1m = Ip[WS(rs, 1)];
Chris@42 690 T1o = Im[WS(rs, 1)];
Chris@42 691 T1l = W[4];
Chris@42 692 T1n = W[5];
Chris@42 693 T1p = FMA(T1l, T1m, T1n * T1o);
Chris@42 694 T2E = FNMS(T1n, T1m, T1l * T1o);
Chris@42 695 }
Chris@42 696 T1q = T1k + T1p;
Chris@42 697 T3o = T2D + T2E;
Chris@42 698 T22 = T1k - T1p;
Chris@42 699 T2F = T2D - T2E;
Chris@42 700 }
Chris@42 701 {
Chris@42 702 E T1w, T24, T1B, T25;
Chris@42 703 {
Chris@42 704 E T1t, T1v, T1s, T1u;
Chris@42 705 T1t = Rp[WS(rs, 6)];
Chris@42 706 T1v = Rm[WS(rs, 6)];
Chris@42 707 T1s = W[22];
Chris@42 708 T1u = W[23];
Chris@42 709 T1w = FMA(T1s, T1t, T1u * T1v);
Chris@42 710 T24 = FNMS(T1u, T1t, T1s * T1v);
Chris@42 711 }
Chris@42 712 {
Chris@42 713 E T1y, T1A, T1x, T1z;
Chris@42 714 T1y = Rp[WS(rs, 1)];
Chris@42 715 T1A = Rm[WS(rs, 1)];
Chris@42 716 T1x = W[2];
Chris@42 717 T1z = W[3];
Chris@42 718 T1B = FMA(T1x, T1y, T1z * T1A);
Chris@42 719 T25 = FNMS(T1z, T1y, T1x * T1A);
Chris@42 720 }
Chris@42 721 T1C = T1w + T1B;
Chris@42 722 T3q = T24 + T25;
Chris@42 723 T26 = T24 - T25;
Chris@42 724 T2H = T1w - T1B;
Chris@42 725 }
Chris@42 726 T1r = T1f - T1q;
Chris@42 727 T1O = T1C - T1N;
Chris@42 728 T1P = T1r + T1O;
Chris@42 729 T3p = T3n + T3o;
Chris@42 730 T3s = T3q + T3r;
Chris@42 731 T3K = T3p + T3s;
Chris@42 732 T3A = T3n - T3o;
Chris@42 733 T3B = T3r - T3q;
Chris@42 734 T3Z = T3B - T3A;
Chris@42 735 T1V = T1f + T1q;
Chris@42 736 T1W = T1C + T1N;
Chris@42 737 T1X = T1V + T1W;
Chris@42 738 T23 = T21 + T22;
Chris@42 739 T28 = T26 + T27;
Chris@42 740 T4q = T23 + T28;
Chris@42 741 T2W = T21 - T22;
Chris@42 742 T2X = T26 - T27;
Chris@42 743 T4f = T2W + T2X;
Chris@42 744 T33 = T2C + T2F;
Chris@42 745 T34 = T2H + T2K;
Chris@42 746 T35 = T33 + T34;
Chris@42 747 T2G = T2C - T2F;
Chris@42 748 T2L = T2H - T2K;
Chris@42 749 T2M = T2G + T2L;
Chris@42 750 }
/* Input group 3: Rp/Rm at 2, 7, 8, 3 and Ip/Im at 0, 5, 4, 9. */
Chris@42 751 {
Chris@42 752 E Tu, T3g, T2c, T2r, T12, T3k, T2f, T2z, TF, T3h, T2d, T2u, TR, T3j, T2i;
Chris@42 753 E T2w;
Chris@42 754 {
Chris@42 755 E To, T2a, Tt, T2b;
Chris@42 756 {
Chris@42 757 E Tl, Tn, Tk, Tm;
Chris@42 758 Tl = Rp[WS(rs, 2)];
Chris@42 759 Tn = Rm[WS(rs, 2)];
Chris@42 760 Tk = W[6];
Chris@42 761 Tm = W[7];
Chris@42 762 To = FMA(Tk, Tl, Tm * Tn);
Chris@42 763 T2a = FNMS(Tm, Tl, Tk * Tn);
Chris@42 764 }
Chris@42 765 {
Chris@42 766 E Tq, Ts, Tp, Tr;
Chris@42 767 Tq = Rp[WS(rs, 7)];
Chris@42 768 Ts = Rm[WS(rs, 7)];
Chris@42 769 Tp = W[26];
Chris@42 770 Tr = W[27];
Chris@42 771 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@42 772 T2b = FNMS(Tr, Tq, Tp * Ts);
Chris@42 773 }
Chris@42 774 Tu = To + Tt;
Chris@42 775 T3g = T2a + T2b;
Chris@42 776 T2c = T2a - T2b;
Chris@42 777 T2r = To - Tt;
Chris@42 778 }
Chris@42 779 {
Chris@42 780 E TW, T2x, T11, T2y;
Chris@42 781 {
Chris@42 782 E TT, TV, TS, TU;
Chris@42 783 TT = Ip[0];
Chris@42 784 TV = Im[0];
Chris@42 785 TS = W[0];
Chris@42 786 TU = W[1];
Chris@42 787 TW = FMA(TS, TT, TU * TV);
Chris@42 788 T2x = FNMS(TU, TT, TS * TV);
Chris@42 789 }
Chris@42 790 {
Chris@42 791 E TY, T10, TX, TZ;
Chris@42 792 TY = Ip[WS(rs, 5)];
Chris@42 793 T10 = Im[WS(rs, 5)];
Chris@42 794 TX = W[20];
Chris@42 795 TZ = W[21];
Chris@42 796 T11 = FMA(TX, TY, TZ * T10);
Chris@42 797 T2y = FNMS(TZ, TY, TX * T10);
Chris@42 798 }
Chris@42 799 T12 = TW + T11;
Chris@42 800 T3k = T2x + T2y;
Chris@42 801 T2f = T11 - TW;
Chris@42 802 T2z = T2x - T2y;
Chris@42 803 }
Chris@42 804 {
Chris@42 805 E Tz, T2s, TE, T2t;
Chris@42 806 {
Chris@42 807 E Tw, Ty, Tv, Tx;
Chris@42 808 Tw = Ip[WS(rs, 4)];
Chris@42 809 Ty = Im[WS(rs, 4)];
Chris@42 810 Tv = W[16];
Chris@42 811 Tx = W[17];
Chris@42 812 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@42 813 T2s = FNMS(Tx, Tw, Tv * Ty);
Chris@42 814 }
Chris@42 815 {
Chris@42 816 E TB, TD, TA, TC;
Chris@42 817 TB = Ip[WS(rs, 9)];
Chris@42 818 TD = Im[WS(rs, 9)];
Chris@42 819 TA = W[36];
Chris@42 820 TC = W[37];
Chris@42 821 TE = FMA(TA, TB, TC * TD);
Chris@42 822 T2t = FNMS(TC, TB, TA * TD);
Chris@42 823 }
Chris@42 824 TF = Tz + TE;
Chris@42 825 T3h = T2s + T2t;
Chris@42 826 T2d = Tz - TE;
Chris@42 827 T2u = T2s - T2t;
Chris@42 828 }
Chris@42 829 {
Chris@42 830 E TL, T2g, TQ, T2h;
Chris@42 831 {
Chris@42 832 E TI, TK, TH, TJ;
Chris@42 833 TI = Rp[WS(rs, 8)];
Chris@42 834 TK = Rm[WS(rs, 8)];
Chris@42 835 TH = W[30];
Chris@42 836 TJ = W[31];
Chris@42 837 TL = FMA(TH, TI, TJ * TK);
Chris@42 838 T2g = FNMS(TJ, TI, TH * TK);
Chris@42 839 }
Chris@42 840 {
Chris@42 841 E TN, TP, TM, TO;
Chris@42 842 TN = Rp[WS(rs, 3)];
Chris@42 843 TP = Rm[WS(rs, 3)];
Chris@42 844 TM = W[10];
Chris@42 845 TO = W[11];
Chris@42 846 TQ = FMA(TM, TN, TO * TP);
Chris@42 847 T2h = FNMS(TO, TN, TM * TP);
Chris@42 848 }
Chris@42 849 TR = TL + TQ;
Chris@42 850 T3j = T2g + T2h;
Chris@42 851 T2i = T2g - T2h;
Chris@42 852 T2w = TL - TQ;
Chris@42 853 }
Chris@42 854 TG = Tu - TF;
Chris@42 855 T13 = TR - T12;
Chris@42 856 T14 = TG + T13;
Chris@42 857 T3i = T3g + T3h;
Chris@42 858 T3l = T3j + T3k;
Chris@42 859 T3J = T3i + T3l;
Chris@42 860 T3D = T3g - T3h;
Chris@42 861 T3E = T3j - T3k;
Chris@42 862 T40 = T3D + T3E;
Chris@42 863 T1S = Tu + TF;
Chris@42 864 T1T = TR + T12;
Chris@42 865 T1U = T1S + T1T;
Chris@42 866 T2e = T2c + T2d;
Chris@42 867 T2j = T2f - T2i;
Chris@42 868 T4p = T2j - T2e;
Chris@42 869 T2T = T2c - T2d;
Chris@42 870 T2U = T2i + T2f;
Chris@42 871 T4e = T2T + T2U;
Chris@42 872 T30 = T2r + T2u;
Chris@42 873 T31 = T2w + T2z;
Chris@42 874 T32 = T30 + T31;
Chris@42 875 T2v = T2r - T2u;
Chris@42 876 T2A = T2w - T2z;
Chris@42 877 T2B = T2v + T2A;
Chris@42 878 }
/* Output group: Rm at 9, 5, 1 and Rp at 6, 2. */
Chris@42 879 {
Chris@42 880 E T3y, T1Q, T3x, T3G, T3I, T3C, T3F, T3H, T3z;
Chris@42 881 T3y = KP559016994 * (T14 - T1P);
Chris@42 882 T1Q = T14 + T1P;
Chris@42 883 T3x = FNMS(KP250000000, T1Q, Tj);
Chris@42 884 T3C = T3A + T3B;
Chris@42 885 T3F = T3D - T3E;
Chris@42 886 T3G = FNMS(KP587785252, T3F, KP951056516 * T3C);
Chris@42 887 T3I = FMA(KP951056516, T3F, KP587785252 * T3C);
Chris@42 888 Rm[WS(rs, 9)] = Tj + T1Q;
Chris@42 889 T3H = T3y + T3x;
Chris@42 890 Rm[WS(rs, 5)] = T3H - T3I;
Chris@42 891 Rp[WS(rs, 6)] = T3H + T3I;
Chris@42 892 T3z = T3x - T3y;
Chris@42 893 Rp[WS(rs, 2)] = T3z - T3G;
Chris@42 894 Rm[WS(rs, 1)] = T3z + T3G;
Chris@42 895 }
/* Output group: Im at 9, 5, 1 and Ip at 6, 2. */
Chris@42 896 {
Chris@42 897 E T47, T41, T46, T45, T49, T43, T44, T4a, T48;
Chris@42 898 T47 = KP559016994 * (T40 + T3Z);
Chris@42 899 T41 = T3Z - T40;
Chris@42 900 T46 = FMA(KP250000000, T41, T42);
Chris@42 901 T43 = T13 - TG;
Chris@42 902 T44 = T1r - T1O;
Chris@42 903 T45 = FMA(KP587785252, T43, KP951056516 * T44);
Chris@42 904 T49 = FNMS(KP587785252, T44, KP951056516 * T43);
Chris@42 905 Im[WS(rs, 9)] = T41 - T42;
Chris@42 906 T4a = T47 + T46;
Chris@42 907 Im[WS(rs, 5)] = T49 - T4a;
Chris@42 908 Ip[WS(rs, 6)] = T49 + T4a;
Chris@42 909 T48 = T46 - T47;
Chris@42 910 Im[WS(rs, 1)] = T45 - T48;
Chris@42 911 Ip[WS(rs, 2)] = T45 + T48;
Chris@42 912 }
/* Output group: Rp at 0, 8, 4 and Rm at 7, 3. */
Chris@42 913 {
Chris@42 914 E T3d, T1Y, T3e, T3u, T3w, T3m, T3t, T3v, T3f;
Chris@42 915 T3d = KP559016994 * (T1U - T1X);
Chris@42 916 T1Y = T1U + T1X;
Chris@42 917 T3e = FNMS(KP250000000, T1Y, T1R);
Chris@42 918 T3m = T3i - T3l;
Chris@42 919 T3t = T3p - T3s;
Chris@42 920 T3u = FMA(KP951056516, T3m, KP587785252 * T3t);
Chris@42 921 T3w = FNMS(KP587785252, T3m, KP951056516 * T3t);
Chris@42 922 Rp[0] = T1R + T1Y;
Chris@42 923 T3v = T3e - T3d;
Chris@42 924 Rm[WS(rs, 7)] = T3v - T3w;
Chris@42 925 Rp[WS(rs, 8)] = T3v + T3w;
Chris@42 926 T3f = T3d + T3e;
Chris@42 927 Rp[WS(rs, 4)] = T3f - T3u;
Chris@42 928 Rm[WS(rs, 3)] = T3f + T3u;
Chris@42 929 }
/* Output group: Ip at 0, 8, 4 and Im at 7, 3. */
Chris@42 930 {
Chris@42 931 E T3U, T3L, T3V, T3T, T3X, T3R, T3S, T3Y, T3W;
Chris@42 932 T3U = KP559016994 * (T3J - T3K);
Chris@42 933 T3L = T3J + T3K;
Chris@42 934 T3V = FNMS(KP250000000, T3L, T3Q);
Chris@42 935 T3R = T1S - T1T;
Chris@42 936 T3S = T1V - T1W;
Chris@42 937 T3T = FMA(KP951056516, T3R, KP587785252 * T3S);
Chris@42 938 T3X = FNMS(KP951056516, T3S, KP587785252 * T3R);
Chris@42 939 Ip[0] = T3L + T3Q;
Chris@42 940 T3Y = T3V - T3U;
Chris@42 941 Im[WS(rs, 7)] = T3X - T3Y;
Chris@42 942 Ip[WS(rs, 8)] = T3X + T3Y;
Chris@42 943 T3W = T3U + T3V;
Chris@42 944 Im[WS(rs, 3)] = T3T - T3W;
Chris@42 945 Ip[WS(rs, 4)] = T3T + T3W;
Chris@42 946 }
/* Output group: Rm at 4, 8, 0 and Rp at 3, 7. */
Chris@42 947 {
Chris@42 948 E T2P, T2N, T2O, T2l, T2R, T29, T2k, T2S, T2Q;
Chris@42 949 T2P = KP559016994 * (T2B - T2M);
Chris@42 950 T2N = T2B + T2M;
Chris@42 951 T2O = FNMS(KP250000000, T2N, T2q);
Chris@42 952 T29 = T23 - T28;
Chris@42 953 T2k = T2e + T2j;
Chris@42 954 T2l = FNMS(KP587785252, T2k, KP951056516 * T29);
Chris@42 955 T2R = FMA(KP951056516, T2k, KP587785252 * T29);
Chris@42 956 Rm[WS(rs, 4)] = T2q + T2N;
Chris@42 957 T2S = T2P + T2O;
Chris@42 958 Rm[WS(rs, 8)] = T2R + T2S;
Chris@42 959 Rm[0] = T2S - T2R;
Chris@42 960 T2Q = T2O - T2P;
Chris@42 961 Rp[WS(rs, 3)] = T2l + T2Q;
Chris@42 962 Rp[WS(rs, 7)] = T2Q - T2l;
Chris@42 963 }
/* Output group: Im at 4, 8, 0 and Ip at 3, 7. */
Chris@42 964 {
Chris@42 965 E T4w, T4r, T4x, T4v, T4A, T4t, T4u, T4z, T4y;
Chris@42 966 T4w = KP559016994 * (T4p + T4q);
Chris@42 967 T4r = T4p - T4q;
Chris@42 968 T4x = FMA(KP250000000, T4r, T4s);
Chris@42 969 T4t = T2v - T2A;
Chris@42 970 T4u = T2G - T2L;
Chris@42 971 T4v = FMA(KP951056516, T4t, KP587785252 * T4u);
Chris@42 972 T4A = FNMS(KP587785252, T4t, KP951056516 * T4u);
Chris@42 973 Im[WS(rs, 4)] = T4r - T4s;
Chris@42 974 T4z = T4w + T4x;
Chris@42 975 Ip[WS(rs, 3)] = T4z - T4A;
Chris@42 976 Ip[WS(rs, 7)] = T4A + T4z;
Chris@42 977 T4y = T4w - T4x;
Chris@42 978 Im[WS(rs, 8)] = T4v + T4y;
Chris@42 979 Im[0] = T4y - T4v;
Chris@42 980 }
/* Output group: Rp at 5, 1, 9 and Rm at 6, 2. */
Chris@42 981 {
Chris@42 982 E T36, T38, T39, T2Z, T3b, T2V, T2Y, T3c, T3a;
Chris@42 983 T36 = KP559016994 * (T32 - T35);
Chris@42 984 T38 = T32 + T35;
Chris@42 985 T39 = FNMS(KP250000000, T38, T37);
Chris@42 986 T2V = T2T - T2U;
Chris@42 987 T2Y = T2W - T2X;
Chris@42 988 T2Z = FMA(KP951056516, T2V, KP587785252 * T2Y);
Chris@42 989 T3b = FNMS(KP587785252, T2V, KP951056516 * T2Y);
Chris@42 990 Rp[WS(rs, 5)] = T37 + T38;
Chris@42 991 T3c = T39 - T36;
Chris@42 992 Rm[WS(rs, 6)] = T3b + T3c;
Chris@42 993 Rm[WS(rs, 2)] = T3c - T3b;
Chris@42 994 T3a = T36 + T39;
Chris@42 995 Rp[WS(rs, 1)] = T2Z + T3a;
Chris@42 996 Rp[WS(rs, 9)] = T3a - T2Z;
Chris@42 997 }
/* Output group: Ip at 5, 1, 9 and Im at 6, 2. */
Chris@42 998 {
Chris@42 999 E T4g, T4k, T4l, T4d, T4o, T4b, T4c, T4n, T4m;
Chris@42 1000 T4g = KP559016994 * (T4e - T4f);
Chris@42 1001 T4k = T4e + T4f;
Chris@42 1002 T4l = FNMS(KP250000000, T4k, T4j);
Chris@42 1003 T4b = T33 - T34;
Chris@42 1004 T4c = T30 - T31;
Chris@42 1005 T4d = FNMS(KP587785252, T4c, KP951056516 * T4b);
Chris@42 1006 T4o = FMA(KP951056516, T4c, KP587785252 * T4b);
Chris@42 1007 Ip[WS(rs, 5)] = T4k + T4j;
Chris@42 1008 T4n = T4g + T4l;
Chris@42 1009 Ip[WS(rs, 1)] = T4n - T4o;
Chris@42 1010 Ip[WS(rs, 9)] = T4o + T4n;
Chris@42 1011 T4m = T4g - T4l;
Chris@42 1012 Im[WS(rs, 6)] = T4d + T4m;
Chris@42 1013 Im[WS(rs, 2)] = T4m - T4d;
Chris@42 1014 }
Chris@42 1015 }
Chris@42 1016 }
Chris@42 1017 }
Chris@42 1018
/*
 * Twiddle-instruction table referenced by desc below (non-FMA build).
 * Two entries: {TW_FULL, 1, 20} followed by {TW_NEXT, 1, 0}.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined outside this file;
 * presumably TW_FULL requests the full twiddle set for size 20 and TW_NEXT
 * ends the list -- confirm against the FFTW headers.
 */
Chris@42 1019 static const tw_instr twinstr[] = {
Chris@42 1020 {TW_FULL, 1, 20},
Chris@42 1021 {TW_NEXT, 1, 0}
Chris@42 1022 };
Chris@42 1023
/*
 * Codelet descriptor for the non-FMA build: size 20, name "hc2cf_20", the
 * twinstr table above, &GENUS, and the operation counts {184, 62, 62, 0}.
 * The first three counts match the header comment of this variant (184
 * additions, 62 multiplications, 62 fused multiply/add).
 * NOTE(review): hc2c_desc and GENUS are defined outside this file; the
 * meaning of the fourth count (0) is not visible here.
 */
Chris@42 1024 static const hc2c_desc desc = { 20, "hc2cf_20", twinstr, &GENUS, {184, 62, 62, 0} };
Chris@42 1025
/*
 * Registration entry point (non-FMA build): hands hc2cf_20 and its
 * descriptor to the planner p through X(khc2c_register), tagged with
 * HC2C_VIA_RDFT.
 * NOTE(review): X(), planner, khc2c_register and HC2C_VIA_RDFT come from
 * outside this file; X() is presumably the FFTW name-mangling macro.
 */
Chris@42 1026 void X(codelet_hc2cf_20) (planner *p) {
Chris@42 1027 X(khc2c_register) (p, hc2cf_20, &desc, HC2C_VIA_RDFT);
Chris@42 1028 }
Chris@42 1029 #endif /* HAVE_FMA */