annotate src/fftw-3.3.5/rdft/scalar/r2cf/hc2cfdft2_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:49:01 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -dit -name hc2cfdft2_20 -include hc2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 316 FP additions, 238 FP multiplications,
Chris@42 32 * (or, 176 additions, 98 multiplications, 140 fused multiply/add),
Chris@42 33 * 180 stack variables, 5 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cf.h"
Chris@42 36
/*
 * hc2cfdft2_20 -- FMA build of this codelet. This definition is compiled only
 * when HAVE_FMA is defined; a non-FMA definition of the same name follows in
 * the #else branch. The code is machine-generated by genfft (see the
 * "Generated by" line above): regenerate it rather than editing by hand.
 *
 * For each m in [mb, me) the loop body:
 *   - reads the eight twiddle values W[0] .. W[7] and derives further
 *     factors from their products;
 *   - reads Rp, Ip, Rm and Im at offsets WS(rs, 0) .. WS(rs, 9) (40 loads);
 *   - stores new values to those same 40 locations, each scaled by
 *     KP500000000 (several of the Im stores are also negated).
 *
 * Parameters:
 *   Rp, Ip, Rm, Im  arrays of R that are read and then overwritten in place.
 *                   Rp and Ip advance by ms per iteration; Rm and Im move
 *                   back by ms.
 *   W               twiddle table; offset by (mb - 1) * 8 before the first
 *                   iteration and advanced by 8 after each one.
 *   rs              stride handed to WS() to index within each array.
 *   mb, me          first and one-past-last value of the loop counter m.
 *   ms              pointer step applied between iterations.
 *
 * NOTE(review): judging by the name and the 20 in desc, this is presumably
 * the size-20 forward halfcomplex-to-complex DFT step -- confirm against the
 * FFTW codelet documentation.
 */
Chris@42 37 static void hc2cfdft2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 43 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 44 {
Chris@42 45 INT m;
/* One pass per m: W is pre-offset by (mb - 1) * 8 and steps by 8; Rp/Ip step forward by ms, Rm/Im backward. */
Chris@42 46 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(80, rs)) {
/* These temporaries are declared at loop scope because they are assigned inside the nested blocks and consumed by the final stores at the bottom. */
Chris@42 47 E T5h, T5C, T5E, T5y, T5w, T5x, T5D, T5z;
Chris@42 48 {
Chris@42 49 E Tm, Tq, Tn, T1, T6, Tg, Tp, Tb, T1i, TU, Tr, TW, Tx, T2B, T1A;
Chris@42 50 E T1u, T2y, T33, T26, T1o, T30, T22, TD, T1Q, T2a, T2e, T2V, T2R, TG, T1V;
Chris@42 51 E TV, TH, TN, T2t, T12, T2p;
Chris@42 52 {
Chris@42 53 E Tw, To, T29, T1h, T1n, T2d, TC, T2U;
/* Load the eight twiddle values W[0..7] and combine them into the derived factors used below. */
Chris@42 54 Tm = W[0];
Chris@42 55 Tq = W[3];
Chris@42 56 Tn = W[2];
Chris@42 57 T1 = W[6];
Chris@42 58 T6 = W[7];
Chris@42 59 Tw = Tm * Tq;
Chris@42 60 To = Tm * Tn;
Chris@42 61 T29 = Tm * T1;
Chris@42 62 T1h = Tn * T1;
Chris@42 63 T1n = Tn * T6;
Chris@42 64 T2d = Tm * T6;
Chris@42 65 Tg = W[5];
Chris@42 66 Tp = W[1];
Chris@42 67 Tb = W[4];
Chris@42 68 {
Chris@42 69 E T21, T25, T1t, T1z;
Chris@42 70 T1i = FMA(Tq, T6, T1h);
Chris@42 71 T25 = Tm * Tg;
Chris@42 72 T1z = Tn * Tg;
Chris@42 73 TU = FMA(Tp, Tq, To);
Chris@42 74 Tr = FNMS(Tp, Tq, To);
Chris@42 75 TW = FNMS(Tp, Tn, Tw);
Chris@42 76 Tx = FMA(Tp, Tn, Tw);
Chris@42 77 T1t = Tn * Tb;
Chris@42 78 T21 = Tm * Tb;
Chris@42 79 T2B = FMA(Tq, Tb, T1z);
Chris@42 80 T1A = FNMS(Tq, Tb, T1z);
Chris@42 81 TC = Tr * Tb;
Chris@42 82 T1u = FMA(Tq, Tg, T1t);
Chris@42 83 T2y = FNMS(Tq, Tg, T1t);
Chris@42 84 T33 = FMA(Tp, Tb, T25);
Chris@42 85 T26 = FNMS(Tp, Tb, T25);
Chris@42 86 T1o = FNMS(Tq, T1, T1n);
Chris@42 87 T30 = FNMS(Tp, Tg, T21);
Chris@42 88 T22 = FMA(Tp, Tg, T21);
Chris@42 89 }
Chris@42 90 TD = FMA(Tx, Tg, TC);
Chris@42 91 T1Q = FNMS(Tx, Tg, TC);
Chris@42 92 T2a = FMA(Tp, T6, T29);
Chris@42 93 T2e = FNMS(Tp, T1, T2d);
Chris@42 94 T2U = Tr * T6;
Chris@42 95 {
Chris@42 96 E T2Q, TE, TM, TF;
Chris@42 97 T2Q = Tr * T1;
Chris@42 98 TF = Tr * Tg;
Chris@42 99 T2V = FNMS(Tx, T1, T2U);
Chris@42 100 T2R = FMA(Tx, T6, T2Q);
Chris@42 101 TG = FNMS(Tx, Tb, TF);
Chris@42 102 T1V = FMA(Tx, Tb, TF);
Chris@42 103 TE = TD * T1;
Chris@42 104 TM = TD * T6;
Chris@42 105 TV = TU * Tb;
Chris@42 106 TH = FMA(TG, T6, TE);
Chris@42 107 TN = FNMS(TG, T1, TM);
Chris@42 108 T2t = TU * T1;
Chris@42 109 T12 = TU * Tg;
Chris@42 110 T2p = TU * T6;
Chris@42 111 }
Chris@42 112 }
Chris@42 113 {
Chris@42 114 E T36, T3Q, T5f, T4D, T5g, T2Y, T4E, T3P, T5R, T5k, T39, TT, T3T, T3m, T49;
Chris@42 115 E T4X, T5T, T5r, T3c, T2i, T3W, T3B, T4o, T4U, T5U, T5u, T3d, T2J, T3X, T3I;
Chris@42 116 E T4v, T4V, T5Q, T5n, T3a, T1G, T3U, T3t, T4g, T4Y;
Chris@42 117 {
Chris@42 118 E T13, T2m, T2q, T2u, T2f, T9, T2O, TA, T2c, T4k, T3i, T5, T2Z, T1e, T2G;
Chris@42 119 E T1O, T2W, TQ, T2C, T1Y, T3v, T27, Tj, T1l, T2v, T3g, T1m, T1D, T2n, T1x;
Chris@42 120 E T2k, T3E, T4c, T2l, T1y, T10, T31, T16, T34, T32, T11, T4B, T3p, T4A, T1T;
Chris@42 121 E T3n, T1b, T2A, T4q, T1U, Te, Tf, T24, T4i, T1r, T4a, T3C, T2s, T43, Tv;
Chris@42 122 E T3L, T2N, T45, TL, T3N, T2T, T2E, T1K;
Chris@42 123 {
Chris@42 124 E T2j, TX, T1B, T1C;
Chris@42 125 {
Chris@42 126 E T1c, T1d, T1M, T1N;
Chris@42 127 {
Chris@42 128 E T2, T3, T7, T8;
/* Read the inputs from Rp/Ip/Rm/Im; sums, differences and twiddle products are formed as the loads proceed. */
Chris@42 129 T7 = Rp[WS(rs, 9)];
Chris@42 130 T8 = Rm[WS(rs, 9)];
Chris@42 131 T2 = Ip[WS(rs, 9)];
Chris@42 132 T2j = FMA(TW, Tg, TV);
Chris@42 133 TX = FNMS(TW, Tg, TV);
Chris@42 134 T13 = FMA(TW, Tb, T12);
Chris@42 135 T2m = FNMS(TW, Tb, T12);
Chris@42 136 T2q = FNMS(TW, T1, T2p);
Chris@42 137 T2u = FMA(TW, T6, T2t);
Chris@42 138 T2f = T7 + T8;
Chris@42 139 T9 = T7 - T8;
Chris@42 140 T3 = Im[WS(rs, 9)];
Chris@42 141 {
Chris@42 142 E Ty, Tz, T2b, T4;
Chris@42 143 Ty = Rp[WS(rs, 2)];
Chris@42 144 Tz = Rm[WS(rs, 2)];
Chris@42 145 T1c = Ip[0];
Chris@42 146 T2b = T2 - T3;
Chris@42 147 T4 = T2 + T3;
Chris@42 148 T2O = Ty - Tz;
Chris@42 149 TA = Ty + Tz;
Chris@42 150 T2c = T2a * T2b;
Chris@42 151 T4k = T2e * T2b;
Chris@42 152 T3i = T6 * T4;
Chris@42 153 T5 = T1 * T4;
Chris@42 154 T1d = Im[0];
Chris@42 155 T1M = Rp[WS(rs, 1)];
Chris@42 156 T1N = Rm[WS(rs, 1)];
Chris@42 157 }
Chris@42 158 }
Chris@42 159 {
Chris@42 160 E TO, TP, T1W, T1X;
Chris@42 161 TO = Rp[WS(rs, 7)];
Chris@42 162 T2Z = T1c - T1d;
Chris@42 163 T1e = T1c + T1d;
Chris@42 164 T2G = T1M + T1N;
Chris@42 165 T1O = T1M - T1N;
Chris@42 166 TP = Rm[WS(rs, 7)];
Chris@42 167 T1W = Rm[WS(rs, 6)];
Chris@42 168 T1X = Rp[WS(rs, 6)];
Chris@42 169 {
Chris@42 170 E Th, Ti, T1j, T1k;
Chris@42 171 Th = Rm[WS(rs, 4)];
Chris@42 172 T2W = TO - TP;
Chris@42 173 TQ = TO + TP;
Chris@42 174 T2C = T1X + T1W;
Chris@42 175 T1Y = T1W - T1X;
Chris@42 176 Ti = Rp[WS(rs, 4)];
Chris@42 177 T1j = Ip[WS(rs, 8)];
Chris@42 178 T1k = Im[WS(rs, 8)];
Chris@42 179 T3v = T1Q * T1Y;
Chris@42 180 T27 = Ti + Th;
Chris@42 181 Tj = Th - Ti;
Chris@42 182 T1l = T1j - T1k;
Chris@42 183 T2v = T1j + T1k;
Chris@42 184 T1B = Rp[WS(rs, 3)];
Chris@42 185 T3g = Tb * Tj;
Chris@42 186 T1m = T1i * T1l;
Chris@42 187 T1C = Rm[WS(rs, 3)];
Chris@42 188 }
Chris@42 189 }
Chris@42 190 }
Chris@42 191 {
Chris@42 192 E T18, T19, T1R, T1S;
Chris@42 193 {
Chris@42 194 E TY, TZ, T1v, T1w, T14, T15;
Chris@42 195 T1v = Ip[WS(rs, 3)];
Chris@42 196 T1w = Im[WS(rs, 3)];
Chris@42 197 TY = Ip[WS(rs, 5)];
Chris@42 198 T1D = T1B + T1C;
Chris@42 199 T2n = T1B - T1C;
Chris@42 200 T1x = T1v - T1w;
Chris@42 201 T2k = T1v + T1w;
Chris@42 202 T3E = T2j * T2n;
Chris@42 203 T4c = T1u * T1D;
Chris@42 204 T2l = T2j * T2k;
Chris@42 205 T1y = T1u * T1x;
Chris@42 206 TZ = Im[WS(rs, 5)];
Chris@42 207 T14 = Rp[WS(rs, 5)];
Chris@42 208 T15 = Rm[WS(rs, 5)];
Chris@42 209 T18 = Rm[0];
Chris@42 210 T10 = TY + TZ;
Chris@42 211 T31 = TY - TZ;
Chris@42 212 T16 = T14 - T15;
Chris@42 213 T34 = T14 + T15;
Chris@42 214 T32 = T30 * T31;
Chris@42 215 T11 = TX * T10;
Chris@42 216 T4B = T30 * T34;
Chris@42 217 T3p = TX * T16;
Chris@42 218 T19 = Rp[0];
Chris@42 219 T1R = Ip[WS(rs, 6)];
Chris@42 220 T1S = Im[WS(rs, 6)];
Chris@42 221 }
Chris@42 222 {
Chris@42 223 E T2r, T23, T1p, T1q;
Chris@42 224 {
Chris@42 225 E Tc, T1a, T2z, Td;
Chris@42 226 Tc = Ip[WS(rs, 4)];
Chris@42 227 T1a = T18 - T19;
Chris@42 228 T4A = T19 + T18;
Chris@42 229 T1T = T1R + T1S;
Chris@42 230 T2z = T1R - T1S;
Chris@42 231 Td = Im[WS(rs, 4)];
Chris@42 232 T3n = Tm * T1a;
Chris@42 233 T1b = Tp * T1a;
Chris@42 234 T2A = T2y * T2z;
Chris@42 235 T4q = T2B * T2z;
Chris@42 236 T1U = T1Q * T1T;
Chris@42 237 T23 = Tc - Td;
Chris@42 238 Te = Tc + Td;
Chris@42 239 }
Chris@42 240 T1p = Rp[WS(rs, 8)];
Chris@42 241 T1q = Rm[WS(rs, 8)];
Chris@42 242 Tf = Tb * Te;
Chris@42 243 T24 = T22 * T23;
Chris@42 244 T4i = T26 * T23;
Chris@42 245 T1r = T1p + T1q;
Chris@42 246 T2r = T1q - T1p;
Chris@42 247 {
Chris@42 248 E T2M, Tu, Ts, Tt;
Chris@42 249 Ts = Ip[WS(rs, 2)];
Chris@42 250 Tt = Im[WS(rs, 2)];
Chris@42 251 T4a = T1i * T1r;
Chris@42 252 T3C = T2u * T2r;
Chris@42 253 T2s = T2q * T2r;
Chris@42 254 T2M = Ts + Tt;
Chris@42 255 Tu = Ts - Tt;
Chris@42 256 {
Chris@42 257 E T2S, TK, TI, TJ, T1I, T1J;
Chris@42 258 TI = Ip[WS(rs, 7)];
Chris@42 259 TJ = Im[WS(rs, 7)];
Chris@42 260 T43 = Tx * Tu;
Chris@42 261 Tv = Tr * Tu;
Chris@42 262 T3L = TG * T2M;
Chris@42 263 T2N = TD * T2M;
Chris@42 264 T2S = TI + TJ;
Chris@42 265 TK = TI - TJ;
Chris@42 266 T1I = Ip[WS(rs, 1)];
Chris@42 267 T1J = Im[WS(rs, 1)];
Chris@42 268 T45 = TN * TK;
Chris@42 269 TL = TH * TK;
Chris@42 270 T3N = T2V * T2S;
Chris@42 271 T2T = T2R * T2S;
Chris@42 272 T2E = T1I - T1J;
Chris@42 273 T1K = T1I + T1J;
Chris@42 274 }
Chris@42 275 }
Chris@42 276 }
Chris@42 277 }
Chris@42 278 }
Chris@42 279 {
Chris@42 280 E T3x, T1L, T2F, T4s, T2P, T2X, T3M, T3O, T35, T4C;
Chris@42 281 T35 = FNMS(T33, T34, T32);
Chris@42 282 T4C = FMA(T33, T31, T4B);
Chris@42 283 T3x = Tq * T1K;
Chris@42 284 T1L = Tn * T1K;
Chris@42 285 T2F = TU * T2E;
Chris@42 286 T4s = TW * T2E;
Chris@42 287 T36 = T2Z - T35;
Chris@42 288 T3Q = T35 + T2Z;
Chris@42 289 T5f = T4A + T4C;
Chris@42 290 T4D = T4A - T4C;
Chris@42 291 T2P = FNMS(TG, T2O, T2N);
Chris@42 292 T2X = FNMS(T2V, T2W, T2T);
Chris@42 293 T3M = FMA(TD, T2O, T3L);
Chris@42 294 T3O = FMA(T2R, T2W, T3N);
Chris@42 295 {
Chris@42 296 E TB, T5j, Tl, T5i, T47, TR, T3h, T3j;
Chris@42 297 {
Chris@42 298 E Ta, Tk, T44, T46;
Chris@42 299 Ta = FNMS(T6, T9, T5);
Chris@42 300 T5g = T2P + T2X;
Chris@42 301 T2Y = T2P - T2X;
Chris@42 302 T4E = T3O - T3M;
Chris@42 303 T3P = T3M + T3O;
Chris@42 304 Tk = FMA(Tg, Tj, Tf);
Chris@42 305 T44 = FMA(Tr, TA, T43);
Chris@42 306 T46 = FMA(TH, TQ, T45);
Chris@42 307 TB = FNMS(Tx, TA, Tv);
Chris@42 308 T5j = Tk + Ta;
Chris@42 309 Tl = Ta - Tk;
Chris@42 310 T5i = T44 + T46;
Chris@42 311 T47 = T44 - T46;
Chris@42 312 TR = FNMS(TN, TQ, TL);
Chris@42 313 T3h = FNMS(Tg, Te, T3g);
Chris@42 314 T3j = FMA(T1, T9, T3i);
Chris@42 315 }
Chris@42 316 {
Chris@42 317 E T3l, T48, T3k, TS;
Chris@42 318 T5R = T5i - T5j;
Chris@42 319 T5k = T5i + T5j;
Chris@42 320 T3l = TB + TR;
Chris@42 321 TS = TB - TR;
Chris@42 322 T48 = T3h + T3j;
Chris@42 323 T3k = T3h - T3j;
Chris@42 324 T39 = TS + Tl;
Chris@42 325 TT = Tl - TS;
Chris@42 326 T3T = T3l + T3k;
Chris@42 327 T3m = T3k - T3l;
Chris@42 328 T49 = T47 + T48;
Chris@42 329 T4X = T47 - T48;
Chris@42 330 }
Chris@42 331 }
Chris@42 332 {
Chris@42 333 E T28, T5q, T20, T5p, T4m, T2g, T3w, T3y;
Chris@42 334 {
Chris@42 335 E T1P, T1Z, T4j, T4l;
Chris@42 336 T1P = FNMS(Tq, T1O, T1L);
Chris@42 337 T1Z = FMA(T1V, T1Y, T1U);
Chris@42 338 T4j = FMA(T22, T27, T4i);
Chris@42 339 T4l = FMA(T2a, T2f, T4k);
Chris@42 340 T28 = FNMS(T26, T27, T24);
Chris@42 341 T5q = T1Z + T1P;
Chris@42 342 T20 = T1P - T1Z;
Chris@42 343 T5p = T4j + T4l;
Chris@42 344 T4m = T4j - T4l;
Chris@42 345 T2g = FNMS(T2e, T2f, T2c);
Chris@42 346 T3w = FNMS(T1V, T1T, T3v);
Chris@42 347 T3y = FMA(Tn, T1O, T3x);
Chris@42 348 }
Chris@42 349 {
Chris@42 350 E T3A, T4n, T3z, T2h;
Chris@42 351 T5T = T5p - T5q;
Chris@42 352 T5r = T5p + T5q;
Chris@42 353 T3A = T28 + T2g;
Chris@42 354 T2h = T28 - T2g;
Chris@42 355 T4n = T3w + T3y;
Chris@42 356 T3z = T3w - T3y;
Chris@42 357 T3c = T2h + T20;
Chris@42 358 T2i = T20 - T2h;
Chris@42 359 T3W = T3A + T3z;
Chris@42 360 T3B = T3z - T3A;
Chris@42 361 T4o = T4m + T4n;
Chris@42 362 T4U = T4m - T4n;
Chris@42 363 }
Chris@42 364 }
Chris@42 365 {
Chris@42 366 E T2D, T5s, T2x, T5t, T4u, T2H, T3D, T3F;
Chris@42 367 {
Chris@42 368 E T2o, T2w, T4r, T4t;
Chris@42 369 T2o = FNMS(T2m, T2n, T2l);
Chris@42 370 T2w = FMA(T2u, T2v, T2s);
Chris@42 371 T4r = FMA(T2y, T2C, T4q);
Chris@42 372 T4t = FMA(TU, T2G, T4s);
Chris@42 373 T2D = FNMS(T2B, T2C, T2A);
Chris@42 374 T5s = T2w + T2o;
Chris@42 375 T2x = T2o - T2w;
Chris@42 376 T5t = T4r + T4t;
Chris@42 377 T4u = T4r - T4t;
Chris@42 378 T2H = FNMS(TW, T2G, T2F);
Chris@42 379 T3D = FNMS(T2q, T2v, T3C);
Chris@42 380 T3F = FMA(T2m, T2k, T3E);
Chris@42 381 }
Chris@42 382 {
Chris@42 383 E T3H, T4p, T3G, T2I;
Chris@42 384 T5U = T5t - T5s;
Chris@42 385 T5u = T5s + T5t;
Chris@42 386 T3H = T2D + T2H;
Chris@42 387 T2I = T2D - T2H;
Chris@42 388 T4p = T3D + T3F;
Chris@42 389 T3G = T3D - T3F;
Chris@42 390 T3d = T2x + T2I;
Chris@42 391 T2J = T2x - T2I;
Chris@42 392 T3X = T3G + T3H;
Chris@42 393 T3I = T3G - T3H;
Chris@42 394 T4v = T4p + T4u;
Chris@42 395 T4V = T4u - T4p;
Chris@42 396 }
Chris@42 397 }
Chris@42 398 {
Chris@42 399 E T1s, T5m, T1g, T5l, T4e, T1E, T3o, T3q;
Chris@42 400 {
Chris@42 401 E T17, T1f, T4b, T4d;
Chris@42 402 T17 = FNMS(T13, T16, T11);
Chris@42 403 T1f = FMA(Tm, T1e, T1b);
Chris@42 404 T4b = FMA(T1o, T1l, T4a);
Chris@42 405 T4d = FMA(T1A, T1x, T4c);
Chris@42 406 T1s = FNMS(T1o, T1r, T1m);
Chris@42 407 T5m = T17 + T1f;
Chris@42 408 T1g = T17 - T1f;
Chris@42 409 T5l = T4b + T4d;
Chris@42 410 T4e = T4b - T4d;
Chris@42 411 T1E = FNMS(T1A, T1D, T1y);
Chris@42 412 T3o = FNMS(Tp, T1e, T3n);
Chris@42 413 T3q = FMA(T13, T10, T3p);
Chris@42 414 }
Chris@42 415 {
Chris@42 416 E T3s, T4f, T3r, T1F;
Chris@42 417 T5Q = T5l - T5m;
Chris@42 418 T5n = T5l + T5m;
Chris@42 419 T3s = T1s + T1E;
Chris@42 420 T1F = T1s - T1E;
Chris@42 421 T4f = T3q + T3o;
Chris@42 422 T3r = T3o - T3q;
Chris@42 423 T3a = T1F + T1g;
Chris@42 424 T1G = T1g - T1F;
Chris@42 425 T3U = T3s + T3r;
Chris@42 426 T3t = T3r - T3s;
Chris@42 427 T4g = T4e + T4f;
Chris@42 428 T4Y = T4e - T4f;
Chris@42 429 }
Chris@42 430 }
Chris@42 431 }
Chris@42 432 }
Chris@42 433 {
Chris@42 434 E T4F, T4G, T4H, T4x, T4z, T41, T4O, T4Q, T40;
Chris@42 435 {
Chris@42 436 E T55, T38, T54, T50, T52, T53, T5e, T5c, T51, T4T;
Chris@42 437 {
Chris@42 438 E T4W, T37, T4Z, T1H, T5b, T5a, T2K, T2L, T4S, T4R;
Chris@42 439 T55 = T4U + T4V;
Chris@42 440 T4W = T4U - T4V;
Chris@42 441 T37 = T2Y + T36;
Chris@42 442 T38 = T36 - T2Y;
Chris@42 443 T54 = T4X + T4Y;
Chris@42 444 T4Z = T4X - T4Y;
Chris@42 445 T1H = TT + T1G;
Chris@42 446 T5b = T1G - TT;
Chris@42 447 T5a = T2J - T2i;
Chris@42 448 T2K = T2i + T2J;
Chris@42 449 T50 = FNMS(KP618033988, T4Z, T4W);
Chris@42 450 T52 = FMA(KP618033988, T4W, T4Z);
Chris@42 451 T2L = T1H + T2K;
Chris@42 452 T4S = T1H - T2K;
Chris@42 453 T53 = T4D - T4E;
Chris@42 454 T4F = T4D + T4E;
/* From here on the results are stored back into Rp/Ip/Rm/Im, interleaved with the remaining arithmetic. */
Chris@42 455 Im[WS(rs, 4)] = KP500000000 * (T2L - T37);
Chris@42 456 T4R = FMA(KP250000000, T2L, T37);
Chris@42 457 T5e = FMA(KP618033988, T5a, T5b);
Chris@42 458 T5c = FNMS(KP618033988, T5b, T5a);
Chris@42 459 T51 = FNMS(KP559016994, T4S, T4R);
Chris@42 460 T4T = FMA(KP559016994, T4S, T4R);
Chris@42 461 }
Chris@42 462 {
Chris@42 463 E T3b, T4M, T4N, T3e, T3f;
Chris@42 464 {
Chris@42 465 E T4h, T58, T57, T4w, T56, T5d, T59;
Chris@42 466 T4G = T49 + T4g;
Chris@42 467 T4h = T49 - T4g;
Chris@42 468 T58 = T54 - T55;
Chris@42 469 T56 = T54 + T55;
Chris@42 470 Ip[WS(rs, 7)] = KP500000000 * (FMA(KP951056516, T50, T4T));
Chris@42 471 Ip[WS(rs, 3)] = KP500000000 * (FNMS(KP951056516, T50, T4T));
Chris@42 472 Im[WS(rs, 8)] = -(KP500000000 * (FNMS(KP951056516, T52, T51)));
Chris@42 473 Im[0] = -(KP500000000 * (FMA(KP951056516, T52, T51)));
Chris@42 474 Rm[WS(rs, 4)] = KP500000000 * (T53 + T56);
Chris@42 475 T57 = FNMS(KP250000000, T56, T53);
Chris@42 476 T4w = T4o - T4v;
Chris@42 477 T4H = T4o + T4v;
Chris@42 478 T3b = T39 + T3a;
Chris@42 479 T4M = T39 - T3a;
Chris@42 480 T5d = FMA(KP559016994, T58, T57);
Chris@42 481 T59 = FNMS(KP559016994, T58, T57);
Chris@42 482 T4x = FMA(KP618033988, T4w, T4h);
Chris@42 483 T4z = FNMS(KP618033988, T4h, T4w);
Chris@42 484 Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T5c, T59));
Chris@42 485 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T5c, T59));
Chris@42 486 Rm[0] = KP500000000 * (FNMS(KP951056516, T5e, T5d));
Chris@42 487 Rm[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5e, T5d));
Chris@42 488 T4N = T3c - T3d;
Chris@42 489 T3e = T3c + T3d;
Chris@42 490 }
Chris@42 491 T3f = T3b + T3e;
Chris@42 492 T41 = T3b - T3e;
Chris@42 493 T4O = FMA(KP618033988, T4N, T4M);
Chris@42 494 T4Q = FNMS(KP618033988, T4M, T4N);
Chris@42 495 Ip[WS(rs, 5)] = KP500000000 * (T38 + T3f);
Chris@42 496 T40 = FNMS(KP250000000, T3f, T38);
Chris@42 497 }
Chris@42 498 }
Chris@42 499 {
Chris@42 500 E T3S, T5Z, T68, T6a, T64, T62;
Chris@42 501 {
Chris@42 502 E T60, T61, T5Y, T5W, T3R, T67, T66, T3K, T5O, T4K, T4J, T5N, T5X, T5P;
Chris@42 503 {
Chris@42 504 E T5S, T5V, T4y, T42, T4I;
Chris@42 505 T60 = T5R + T5Q;
Chris@42 506 T5S = T5Q - T5R;
Chris@42 507 T5V = T5T - T5U;
Chris@42 508 T61 = T5T + T5U;
Chris@42 509 T4y = FNMS(KP559016994, T41, T40);
Chris@42 510 T42 = FMA(KP559016994, T41, T40);
Chris@42 511 T4I = T4G + T4H;
Chris@42 512 T4K = T4G - T4H;
Chris@42 513 Ip[WS(rs, 9)] = KP500000000 * (FMA(KP951056516, T4x, T42));
Chris@42 514 Ip[WS(rs, 1)] = KP500000000 * (FNMS(KP951056516, T4x, T42));
Chris@42 515 Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP951056516, T4z, T4y)));
Chris@42 516 Im[WS(rs, 2)] = -(KP500000000 * (FMA(KP951056516, T4z, T4y)));
Chris@42 517 Rp[WS(rs, 5)] = KP500000000 * (T4F + T4I);
Chris@42 518 T4J = FNMS(KP250000000, T4I, T4F);
Chris@42 519 T5Y = FMA(KP618033988, T5S, T5V);
Chris@42 520 T5W = FNMS(KP618033988, T5V, T5S);
Chris@42 521 }
Chris@42 522 T3S = T3Q - T3P;
Chris@42 523 T3R = T3P + T3Q;
Chris@42 524 {
Chris@42 525 E T4L, T4P, T3u, T3J;
Chris@42 526 T4L = FMA(KP559016994, T4K, T4J);
Chris@42 527 T4P = FNMS(KP559016994, T4K, T4J);
Chris@42 528 T3u = T3m + T3t;
Chris@42 529 T67 = T3t - T3m;
Chris@42 530 T66 = T3I - T3B;
Chris@42 531 T3J = T3B + T3I;
Chris@42 532 Rp[WS(rs, 9)] = KP500000000 * (FNMS(KP951056516, T4O, T4L));
Chris@42 533 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T4O, T4L));
Chris@42 534 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T4Q, T4P));
Chris@42 535 Rm[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T4Q, T4P));
Chris@42 536 T3K = T3u + T3J;
Chris@42 537 T5O = T3J - T3u;
Chris@42 538 }
Chris@42 539 Im[WS(rs, 9)] = KP500000000 * (T3K - T3R);
Chris@42 540 T5N = FMA(KP250000000, T3K, T3R);
Chris@42 541 T5Z = T5f - T5g;
Chris@42 542 T5h = T5f + T5g;
Chris@42 543 T68 = FNMS(KP618033988, T67, T66);
Chris@42 544 T6a = FMA(KP618033988, T66, T67);
Chris@42 545 T5X = FNMS(KP559016994, T5O, T5N);
Chris@42 546 T5P = FMA(KP559016994, T5O, T5N);
Chris@42 547 Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP951056516, T5W, T5P)));
Chris@42 548 Ip[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T5W, T5P));
Chris@42 549 Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP951056516, T5Y, T5X)));
Chris@42 550 Ip[WS(rs, 2)] = KP500000000 * (FMA(KP951056516, T5Y, T5X));
Chris@42 551 T64 = T60 - T61;
Chris@42 552 T62 = T60 + T61;
Chris@42 553 }
Chris@42 554 {
Chris@42 555 E T5o, T5v, T5M, T5K, T5A, T5B, T3Z, T5G, T5I, T5J, T63, T5F, T5L, T5H;
Chris@42 556 T5o = T5k + T5n;
Chris@42 557 T5I = T5k - T5n;
Chris@42 558 T5J = T5u - T5r;
Chris@42 559 T5v = T5r + T5u;
Chris@42 560 Rm[WS(rs, 9)] = KP500000000 * (T5Z + T62);
Chris@42 561 T63 = FNMS(KP250000000, T62, T5Z);
Chris@42 562 T5M = FMA(KP618033988, T5I, T5J);
Chris@42 563 T5K = FNMS(KP618033988, T5J, T5I);
Chris@42 564 {
Chris@42 565 E T65, T69, T3V, T3Y;
Chris@42 566 T65 = FNMS(KP559016994, T64, T63);
Chris@42 567 T69 = FMA(KP559016994, T64, T63);
Chris@42 568 T3V = T3T + T3U;
Chris@42 569 T5A = T3T - T3U;
Chris@42 570 T5B = T3W - T3X;
Chris@42 571 T3Y = T3W + T3X;
Chris@42 572 Rm[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T68, T65));
Chris@42 573 Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T68, T65));
Chris@42 574 Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP951056516, T6a, T69));
Chris@42 575 Rp[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T6a, T69));
Chris@42 576 T3Z = T3V + T3Y;
Chris@42 577 T5G = T3V - T3Y;
Chris@42 578 }
Chris@42 579 Ip[0] = KP500000000 * (T3S + T3Z);
Chris@42 580 T5F = FNMS(KP250000000, T3Z, T3S);
Chris@42 581 T5C = FMA(KP618033988, T5B, T5A);
Chris@42 582 T5E = FNMS(KP618033988, T5A, T5B);
Chris@42 583 T5L = FNMS(KP559016994, T5G, T5F);
Chris@42 584 T5H = FMA(KP559016994, T5G, T5F);
Chris@42 585 Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP951056516, T5K, T5H)));
Chris@42 586 Ip[WS(rs, 4)] = KP500000000 * (FMA(KP951056516, T5K, T5H));
Chris@42 587 Im[WS(rs, 7)] = -(KP500000000 * (FNMS(KP951056516, T5M, T5L)));
Chris@42 588 Ip[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5M, T5L));
Chris@42 589 T5y = T5o - T5v;
Chris@42 590 T5w = T5o + T5v;
Chris@42 591 }
Chris@42 592 }
Chris@42 593 }
Chris@42 594 }
Chris@42 595 }
/* Final five stores, built from the loop-scope temporaries T5h, T5w, T5y, T5C and T5E set inside the block above. */
Chris@42 596 Rp[0] = KP500000000 * (T5h + T5w);
Chris@42 597 T5x = FNMS(KP250000000, T5w, T5h);
Chris@42 598 T5D = FNMS(KP559016994, T5y, T5x);
Chris@42 599 T5z = FMA(KP559016994, T5y, T5x);
Chris@42 600 Rm[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T5C, T5z));
Chris@42 601 Rp[WS(rs, 4)] = KP500000000 * (FNMS(KP951056516, T5C, T5z));
Chris@42 602 Rm[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T5E, T5D));
Chris@42 603 Rp[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5E, T5D));
Chris@42 604 }
Chris@42 605 }
Chris@42 606 }
Chris@42 607
/*
 * Twiddle instruction table for the HAVE_FMA build of hc2cfdft2_20; it is
 * referenced by desc below. It holds four TW_CEXP entries whose last fields
 * are 1, 3, 9 and 19, followed by a TW_NEXT entry that ends the list.
 * NOTE(review): presumably each TW_CEXP entry supplies one complex twiddle
 * (two R values), which would match the codelet reading W[0..7] and stepping
 * W by 8 per iteration -- confirm against the tw_instr definition.
 */
Chris@42 608 static const tw_instr twinstr[] = {
Chris@42 609 {TW_CEXP, 1, 1},
Chris@42 610 {TW_CEXP, 1, 3},
Chris@42 611 {TW_CEXP, 1, 9},
Chris@42 612 {TW_CEXP, 1, 19},
Chris@42 613 {TW_NEXT, 1, 0}
Chris@42 614 };
Chris@42 615
/*
 * Codelet descriptor handed to the planner at registration: the value 20
 * (the generator was invoked with "-n 20"), the codelet name, its twiddle
 * instruction table, the GENUS object, and the counts {176, 98, 140, 0},
 * whose first three values match the additions, multiplications and fused
 * multiply/adds quoted in the generated header comment for this variant.
 */
Chris@42 616 static const hc2c_desc desc = { 20, "hc2cfdft2_20", twinstr, &GENUS, {176, 98, 140, 0} };
Chris@42 617
/*
 * Registration entry point for this codelet: passes the hc2cfdft2_20
 * function and its descriptor to X(khc2c_register) on planner p, with the
 * HC2C_VIA_DFT flag.
 */
Chris@42 618 void X(codelet_hc2cfdft2_20) (planner *p) {
Chris@42 619 X(khc2c_register) (p, hc2cfdft2_20, &desc, HC2C_VIA_DFT);
Chris@42 620 }
Chris@42 621 #else /* HAVE_FMA */
Chris@42 622
Chris@42 623 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 20 -dit -name hc2cfdft2_20 -include hc2cf.h */
Chris@42 624
Chris@42 625 /*
Chris@42 626 * This function contains 316 FP additions, 180 FP multiplications,
Chris@42 627 * (or, 244 additions, 108 multiplications, 72 fused multiply/add),
Chris@42 628 * 134 stack variables, 5 constants, and 80 memory accesses
Chris@42 629 */
Chris@42 630 #include "hc2cf.h"
Chris@42 631
Chris@42 632 static void hc2cfdft2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 633 {
Chris@42 634 DK(KP125000000, +0.125000000000000000000000000000000000000000000);
Chris@42 635 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 636 DK(KP279508497, +0.279508497187473712051146708591409529430077295);
Chris@42 637 DK(KP293892626, +0.293892626146236564584352977319536384298826219);
Chris@42 638 DK(KP475528258, +0.475528258147576786058219666689691071702849317);
Chris@42 639 {
Chris@42 640 INT m;
Chris@42 641 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@42 642 E T4, T7, Tm, To, Tq, Tu, T1I, T1G, T8, T5, Ta, T1u, T2u, Tg, T2s;
Chris@42 643 E T21, T1A, T1Z, T1O, T2I, T1K, T2G, Tw, TC, T2a, T2e, TH, TI, TJ, TX;
Chris@42 644 E T2D, TN, T2B, T26, T1n, TZ, T24, T1j;
Chris@42 645 {
Chris@42 646 E T9, T1y, Te, T1t, T6, T1z, Tf, T1s;
Chris@42 647 {
Chris@42 648 E Tn, Tt, Tp, Ts;
Chris@42 649 T4 = W[0];
Chris@42 650 T7 = W[1];
Chris@42 651 Tm = W[2];
Chris@42 652 To = W[3];
Chris@42 653 Tn = T4 * Tm;
Chris@42 654 Tt = T7 * Tm;
Chris@42 655 Tp = T7 * To;
Chris@42 656 Ts = T4 * To;
Chris@42 657 Tq = Tn - Tp;
Chris@42 658 Tu = Ts + Tt;
Chris@42 659 T1I = Ts - Tt;
Chris@42 660 T1G = Tn + Tp;
Chris@42 661 T8 = W[5];
Chris@42 662 T9 = T7 * T8;
Chris@42 663 T1y = Tm * T8;
Chris@42 664 Te = T4 * T8;
Chris@42 665 T1t = To * T8;
Chris@42 666 T5 = W[4];
Chris@42 667 T6 = T4 * T5;
Chris@42 668 T1z = To * T5;
Chris@42 669 Tf = T7 * T5;
Chris@42 670 T1s = Tm * T5;
Chris@42 671 }
Chris@42 672 Ta = T6 - T9;
Chris@42 673 T1u = T1s + T1t;
Chris@42 674 T2u = T1y + T1z;
Chris@42 675 Tg = Te + Tf;
Chris@42 676 T2s = T1s - T1t;
Chris@42 677 T21 = Te - Tf;
Chris@42 678 T1A = T1y - T1z;
Chris@42 679 T1Z = T6 + T9;
Chris@42 680 {
Chris@42 681 E T1M, T1N, T1H, T1J;
Chris@42 682 T1M = T1G * T8;
Chris@42 683 T1N = T1I * T5;
Chris@42 684 T1O = T1M + T1N;
Chris@42 685 T2I = T1M - T1N;
Chris@42 686 T1H = T1G * T5;
Chris@42 687 T1J = T1I * T8;
Chris@42 688 T1K = T1H - T1J;
Chris@42 689 T2G = T1H + T1J;
Chris@42 690 {
Chris@42 691 E Tr, Tv, TA, TB;
Chris@42 692 Tr = Tq * T5;
Chris@42 693 Tv = Tu * T8;
Chris@42 694 Tw = Tr + Tv;
Chris@42 695 TA = Tq * T8;
Chris@42 696 TB = Tu * T5;
Chris@42 697 TC = TA - TB;
Chris@42 698 T2a = Tr - Tv;
Chris@42 699 T2e = TA + TB;
Chris@42 700 TH = W[6];
Chris@42 701 TI = W[7];
Chris@42 702 TJ = FMA(Tq, TH, Tu * TI);
Chris@42 703 TX = FMA(Tw, TH, TC * TI);
Chris@42 704 T2D = FMA(T1G, TH, T1I * TI);
Chris@42 705 TN = FNMS(Tu, TH, Tq * TI);
Chris@42 706 T2B = FNMS(T1I, TH, T1G * TI);
Chris@42 707 T26 = FNMS(T7, TH, T4 * TI);
Chris@42 708 T1n = FNMS(To, TH, Tm * TI);
Chris@42 709 TZ = FNMS(TC, TH, Tw * TI);
Chris@42 710 T24 = FMA(T4, TH, T7 * TI);
Chris@42 711 T1j = FMA(Tm, TH, To * TI);
Chris@42 712 }
Chris@42 713 }
Chris@42 714 }
Chris@42 715 {
Chris@42 716 E Tl, T3n, T1i, T2Q, T47, T50, T4S, T5i, T2M, T2T, T4I, T5f, T4L, T5e, T4P;
Chris@42 717 E T5h, T2r, T2S, T1X, T2P, T31, T3u, T36, T3t, T3E, T4l, T3U, T4j, T3h, T3r;
Chris@42 718 E T3J, T4m, T3c, T3q, T3P, T4i, TS, T51, T3m, T48;
Chris@42 719 {
Chris@42 720 E T3, T45, T1V, T3f, Tz, TF, TW, T3A, TM, TQ, T11, T3B, Td, Tj, T1Q;
Chris@42 721 E T3e, T19, T3L, T23, T39, T2p, T3S, T2z, T34, T1E, T3G, T2K, T2Y, T1g, T3M;
Chris@42 722 E T28, T3a, T2i, T3R, T2w, T33, T1r, T3F, T2F, T2X, T4N, T4O;
Chris@42 723 {
Chris@42 724 E T1, T2, T1R, T1S, T1T, T1U;
Chris@42 725 T1 = Ip[0];
Chris@42 726 T2 = Im[0];
Chris@42 727 T1R = T1 + T2;
Chris@42 728 T1S = Rp[0];
Chris@42 729 T1T = Rm[0];
Chris@42 730 T1U = T1S - T1T;
Chris@42 731 T3 = T1 - T2;
Chris@42 732 T45 = T1S + T1T;
Chris@42 733 T1V = FNMS(T7, T1U, T4 * T1R);
Chris@42 734 T3f = FMA(T4, T1U, T7 * T1R);
Chris@42 735 }
Chris@42 736 {
Chris@42 737 E Tx, Ty, TU, TD, TE, TV;
Chris@42 738 Tx = Ip[WS(rs, 2)];
Chris@42 739 Ty = Im[WS(rs, 2)];
Chris@42 740 TU = Tx - Ty;
Chris@42 741 TD = Rp[WS(rs, 2)];
Chris@42 742 TE = Rm[WS(rs, 2)];
Chris@42 743 TV = TD + TE;
Chris@42 744 Tz = Tx + Ty;
Chris@42 745 TF = TD - TE;
Chris@42 746 TW = FNMS(Tu, TV, Tq * TU);
Chris@42 747 T3A = FMA(Tu, TU, Tq * TV);
Chris@42 748 }
Chris@42 749 {
Chris@42 750 E TK, TL, TY, TO, TP, T10;
Chris@42 751 TK = Ip[WS(rs, 7)];
Chris@42 752 TL = Im[WS(rs, 7)];
Chris@42 753 TY = TK - TL;
Chris@42 754 TO = Rp[WS(rs, 7)];
Chris@42 755 TP = Rm[WS(rs, 7)];
Chris@42 756 T10 = TO + TP;
Chris@42 757 TM = TK + TL;
Chris@42 758 TQ = TO - TP;
Chris@42 759 T11 = FNMS(TZ, T10, TX * TY);
Chris@42 760 T3B = FMA(TZ, TY, TX * T10);
Chris@42 761 }
Chris@42 762 {
Chris@42 763 E Tb, Tc, T1L, Th, Ti, T1P;
Chris@42 764 Tb = Ip[WS(rs, 5)];
Chris@42 765 Tc = Im[WS(rs, 5)];
Chris@42 766 T1L = Tb + Tc;
Chris@42 767 Th = Rp[WS(rs, 5)];
Chris@42 768 Ti = Rm[WS(rs, 5)];
Chris@42 769 T1P = Th - Ti;
Chris@42 770 Td = Tb - Tc;
Chris@42 771 Tj = Th + Ti;
Chris@42 772 T1Q = FNMS(T1O, T1P, T1K * T1L);
Chris@42 773 T3e = FMA(T1K, T1P, T1O * T1L);
Chris@42 774 }
Chris@42 775 {
Chris@42 776 E T15, T20, T18, T22;
Chris@42 777 {
Chris@42 778 E T13, T14, T16, T17;
Chris@42 779 T13 = Ip[WS(rs, 4)];
Chris@42 780 T14 = Im[WS(rs, 4)];
Chris@42 781 T15 = T13 + T14;
Chris@42 782 T20 = T13 - T14;
Chris@42 783 T16 = Rp[WS(rs, 4)];
Chris@42 784 T17 = Rm[WS(rs, 4)];
Chris@42 785 T18 = T16 - T17;
Chris@42 786 T22 = T16 + T17;
Chris@42 787 }
Chris@42 788 T19 = FNMS(T8, T18, T5 * T15);
Chris@42 789 T3L = FMA(T21, T20, T1Z * T22);
Chris@42 790 T23 = FNMS(T21, T22, T1Z * T20);
Chris@42 791 T39 = FMA(T8, T15, T5 * T18);
Chris@42 792 }
Chris@42 793 {
Chris@42 794 E T2l, T2x, T2o, T2y;
Chris@42 795 {
Chris@42 796 E T2j, T2k, T2m, T2n;
Chris@42 797 T2j = Ip[WS(rs, 1)];
Chris@42 798 T2k = Im[WS(rs, 1)];
Chris@42 799 T2l = T2j + T2k;
Chris@42 800 T2x = T2j - T2k;
Chris@42 801 T2m = Rp[WS(rs, 1)];
Chris@42 802 T2n = Rm[WS(rs, 1)];
Chris@42 803 T2o = T2m - T2n;
Chris@42 804 T2y = T2m + T2n;
Chris@42 805 }
Chris@42 806 T2p = FNMS(To, T2o, Tm * T2l);
Chris@42 807 T3S = FMA(T1I, T2x, T1G * T2y);
Chris@42 808 T2z = FNMS(T1I, T2y, T1G * T2x);
Chris@42 809 T34 = FMA(To, T2l, Tm * T2o);
Chris@42 810 }
Chris@42 811 {
Chris@42 812 E T1x, T2H, T1D, T2J;
Chris@42 813 {
Chris@42 814 E T1v, T1w, T1B, T1C;
Chris@42 815 T1v = Ip[WS(rs, 3)];
Chris@42 816 T1w = Im[WS(rs, 3)];
Chris@42 817 T1x = T1v - T1w;
Chris@42 818 T2H = T1v + T1w;
Chris@42 819 T1B = Rp[WS(rs, 3)];
Chris@42 820 T1C = Rm[WS(rs, 3)];
Chris@42 821 T1D = T1B + T1C;
Chris@42 822 T2J = T1B - T1C;
Chris@42 823 }
Chris@42 824 T1E = FNMS(T1A, T1D, T1u * T1x);
Chris@42 825 T3G = FMA(T1u, T1D, T1A * T1x);
Chris@42 826 T2K = FNMS(T2I, T2J, T2G * T2H);
Chris@42 827 T2Y = FMA(T2G, T2J, T2I * T2H);
Chris@42 828 }
Chris@42 829 {
Chris@42 830 E T1c, T25, T1f, T27;
Chris@42 831 {
Chris@42 832 E T1a, T1b, T1d, T1e;
Chris@42 833 T1a = Ip[WS(rs, 9)];
Chris@42 834 T1b = Im[WS(rs, 9)];
Chris@42 835 T1c = T1a + T1b;
Chris@42 836 T25 = T1a - T1b;
Chris@42 837 T1d = Rp[WS(rs, 9)];
Chris@42 838 T1e = Rm[WS(rs, 9)];
Chris@42 839 T1f = T1d - T1e;
Chris@42 840 T27 = T1d + T1e;
Chris@42 841 }
Chris@42 842 T1g = FNMS(TI, T1f, TH * T1c);
Chris@42 843 T3M = FMA(T26, T25, T24 * T27);
Chris@42 844 T28 = FNMS(T26, T27, T24 * T25);
Chris@42 845 T3a = FMA(TI, T1c, TH * T1f);
Chris@42 846 }
Chris@42 847 {
Chris@42 848 E T2d, T2t, T2h, T2v;
Chris@42 849 {
Chris@42 850 E T2b, T2c, T2f, T2g;
Chris@42 851 T2b = Ip[WS(rs, 6)];
Chris@42 852 T2c = Im[WS(rs, 6)];
Chris@42 853 T2d = T2b + T2c;
Chris@42 854 T2t = T2b - T2c;
Chris@42 855 T2f = Rp[WS(rs, 6)];
Chris@42 856 T2g = Rm[WS(rs, 6)];
Chris@42 857 T2h = T2f - T2g;
Chris@42 858 T2v = T2f + T2g;
Chris@42 859 }
Chris@42 860 T2i = FNMS(T2e, T2h, T2a * T2d);
Chris@42 861 T3R = FMA(T2u, T2t, T2s * T2v);
Chris@42 862 T2w = FNMS(T2u, T2v, T2s * T2t);
Chris@42 863 T33 = FMA(T2e, T2d, T2a * T2h);
Chris@42 864 }
Chris@42 865 {
Chris@42 866 E T1m, T2E, T1q, T2C;
Chris@42 867 {
Chris@42 868 E T1k, T1l, T1o, T1p;
Chris@42 869 T1k = Ip[WS(rs, 8)];
Chris@42 870 T1l = Im[WS(rs, 8)];
Chris@42 871 T1m = T1k - T1l;
Chris@42 872 T2E = T1k + T1l;
Chris@42 873 T1o = Rp[WS(rs, 8)];
Chris@42 874 T1p = Rm[WS(rs, 8)];
Chris@42 875 T1q = T1o + T1p;
Chris@42 876 T2C = T1p - T1o;
Chris@42 877 }
Chris@42 878 T1r = FNMS(T1n, T1q, T1j * T1m);
Chris@42 879 T3F = FMA(T1j, T1q, T1n * T1m);
Chris@42 880 T2F = FMA(T2B, T2C, T2D * T2E);
Chris@42 881 T2X = FNMS(T2B, T2E, T2D * T2C);
Chris@42 882 }
Chris@42 883 {
Chris@42 884 E Tk, T12, T1h, T46;
Chris@42 885 Tk = FNMS(Tg, Tj, Ta * Td);
Chris@42 886 Tl = T3 - Tk;
Chris@42 887 T3n = Tk + T3;
Chris@42 888 T12 = TW - T11;
Chris@42 889 T1h = T19 - T1g;
Chris@42 890 T1i = T12 - T1h;
Chris@42 891 T2Q = T12 + T1h;
Chris@42 892 T46 = FMA(Ta, Tj, Tg * Td);
Chris@42 893 T47 = T45 - T46;
Chris@42 894 T50 = T45 + T46;
Chris@42 895 {
Chris@42 896 E T4Q, T4R, T2A, T2L;
Chris@42 897 T4Q = T2F + T2K;
Chris@42 898 T4R = T3R + T3S;
Chris@42 899 T4S = T4Q + T4R;
Chris@42 900 T5i = T4R - T4Q;
Chris@42 901 T2A = T2w - T2z;
Chris@42 902 T2L = T2F - T2K;
Chris@42 903 T2M = T2A - T2L;
Chris@42 904 T2T = T2L + T2A;
Chris@42 905 }
Chris@42 906 }
Chris@42 907 {
Chris@42 908 E T4G, T4H, T4J, T4K;
Chris@42 909 T4G = T3A + T3B;
Chris@42 910 T4H = T19 + T1g;
Chris@42 911 T4I = T4G + T4H;
Chris@42 912 T5f = T4G - T4H;
Chris@42 913 T4J = T3F + T3G;
Chris@42 914 T4K = T1Q + T1V;
Chris@42 915 T4L = T4J + T4K;
Chris@42 916 T5e = T4J - T4K;
Chris@42 917 }
Chris@42 918 T4N = T3L + T3M;
Chris@42 919 T4O = T2i + T2p;
Chris@42 920 T4P = T4N + T4O;
Chris@42 921 T5h = T4N - T4O;
Chris@42 922 {
Chris@42 923 E T29, T2q, T1F, T1W;
Chris@42 924 T29 = T23 - T28;
Chris@42 925 T2q = T2i - T2p;
Chris@42 926 T2r = T29 - T2q;
Chris@42 927 T2S = T29 + T2q;
Chris@42 928 T1F = T1r - T1E;
Chris@42 929 T1W = T1Q - T1V;
Chris@42 930 T1X = T1F + T1W;
Chris@42 931 T2P = T1W - T1F;
Chris@42 932 }
Chris@42 933 {
Chris@42 934 E T3C, T3D, T3N, T3O;
Chris@42 935 {
Chris@42 936 E T2Z, T30, T32, T35;
Chris@42 937 T2Z = T2X - T2Y;
Chris@42 938 T30 = T2w + T2z;
Chris@42 939 T31 = T2Z - T30;
Chris@42 940 T3u = T2Z + T30;
Chris@42 941 T32 = T23 + T28;
Chris@42 942 T35 = T33 + T34;
Chris@42 943 T36 = T32 + T35;
Chris@42 944 T3t = T32 - T35;
Chris@42 945 }
Chris@42 946 T3C = T3A - T3B;
Chris@42 947 T3D = T3a - T39;
Chris@42 948 T3E = T3C + T3D;
Chris@42 949 T4l = T3C - T3D;
Chris@42 950 {
Chris@42 951 E T3Q, T3T, T3d, T3g;
Chris@42 952 T3Q = T2X + T2Y;
Chris@42 953 T3T = T3R - T3S;
Chris@42 954 T3U = T3Q + T3T;
Chris@42 955 T4j = T3T - T3Q;
Chris@42 956 T3d = T1r + T1E;
Chris@42 957 T3g = T3e + T3f;
Chris@42 958 T3h = T3d + T3g;
Chris@42 959 T3r = T3d - T3g;
Chris@42 960 }
Chris@42 961 {
Chris@42 962 E T3H, T3I, T38, T3b;
Chris@42 963 T3H = T3F - T3G;
Chris@42 964 T3I = T3e - T3f;
Chris@42 965 T3J = T3H + T3I;
Chris@42 966 T4m = T3H - T3I;
Chris@42 967 T38 = TW + T11;
Chris@42 968 T3b = T39 + T3a;
Chris@42 969 T3c = T38 + T3b;
Chris@42 970 T3q = T38 - T3b;
Chris@42 971 }
Chris@42 972 T3N = T3L - T3M;
Chris@42 973 T3O = T34 - T33;
Chris@42 974 T3P = T3N + T3O;
Chris@42 975 T4i = T3N - T3O;
Chris@42 976 {
Chris@42 977 E TG, TR, T3k, T3l;
Chris@42 978 TG = FNMS(TC, TF, Tw * Tz);
Chris@42 979 TR = FNMS(TN, TQ, TJ * TM);
Chris@42 980 TS = TG - TR;
Chris@42 981 T51 = TG + TR;
Chris@42 982 T3k = FMA(TC, Tz, Tw * TF);
Chris@42 983 T3l = FMA(TN, TM, TJ * TQ);
Chris@42 984 T3m = T3k + T3l;
Chris@42 985 T48 = T3l - T3k;
Chris@42 986 }
Chris@42 987 }
Chris@42 988 }
Chris@42 989 {
Chris@42 990 E T3W, T3Y, TT, T2O, T3x, T3y, T3X, T3z;
Chris@42 991 {
Chris@42 992 E T3K, T3V, T1Y, T2N;
Chris@42 993 T3K = T3E - T3J;
Chris@42 994 T3V = T3P - T3U;
Chris@42 995 T3W = FMA(KP475528258, T3K, KP293892626 * T3V);
Chris@42 996 T3Y = FNMS(KP293892626, T3K, KP475528258 * T3V);
Chris@42 997 TT = Tl - TS;
Chris@42 998 T1Y = T1i + T1X;
Chris@42 999 T2N = T2r + T2M;
Chris@42 1000 T2O = T1Y + T2N;
Chris@42 1001 T3x = KP279508497 * (T1Y - T2N);
Chris@42 1002 T3y = FNMS(KP125000000, T2O, KP500000000 * TT);
Chris@42 1003 }
Chris@42 1004 Ip[WS(rs, 5)] = KP500000000 * (TT + T2O);
Chris@42 1005 T3X = T3x - T3y;
Chris@42 1006 Im[WS(rs, 2)] = T3X - T3Y;
Chris@42 1007 Im[WS(rs, 6)] = T3X + T3Y;
Chris@42 1008 T3z = T3x + T3y;
Chris@42 1009 Ip[WS(rs, 1)] = T3z - T3W;
Chris@42 1010 Ip[WS(rs, 9)] = T3z + T3W;
Chris@42 1011 }
Chris@42 1012 {
Chris@42 1013 E T41, T4d, T49, T4a, T44, T4b, T4e, T4c;
Chris@42 1014 {
Chris@42 1015 E T3Z, T40, T42, T43;
Chris@42 1016 T3Z = T1i - T1X;
Chris@42 1017 T40 = T2r - T2M;
Chris@42 1018 T41 = FMA(KP475528258, T3Z, KP293892626 * T40);
Chris@42 1019 T4d = FNMS(KP293892626, T3Z, KP475528258 * T40);
Chris@42 1020 T49 = T47 + T48;
Chris@42 1021 T42 = T3E + T3J;
Chris@42 1022 T43 = T3P + T3U;
Chris@42 1023 T4a = T42 + T43;
Chris@42 1024 T44 = KP279508497 * (T42 - T43);
Chris@42 1025 T4b = FNMS(KP125000000, T4a, KP500000000 * T49);
Chris@42 1026 }
Chris@42 1027 Rp[WS(rs, 5)] = KP500000000 * (T49 + T4a);
Chris@42 1028 T4e = T4b - T44;
Chris@42 1029 Rm[WS(rs, 6)] = T4d + T4e;
Chris@42 1030 Rm[WS(rs, 2)] = T4e - T4d;
Chris@42 1031 T4c = T44 + T4b;
Chris@42 1032 Rp[WS(rs, 1)] = T41 + T4c;
Chris@42 1033 Rp[WS(rs, 9)] = T4c - T41;
Chris@42 1034 }
Chris@42 1035 {
Chris@42 1036 E T4o, T4q, T2W, T2V, T4f, T4g, T4p, T4h;
Chris@42 1037 {
Chris@42 1038 E T4k, T4n, T2R, T2U;
Chris@42 1039 T4k = T4i - T4j;
Chris@42 1040 T4n = T4l - T4m;
Chris@42 1041 T4o = FNMS(KP293892626, T4n, KP475528258 * T4k);
Chris@42 1042 T4q = FMA(KP475528258, T4n, KP293892626 * T4k);
Chris@42 1043 T2W = TS + Tl;
Chris@42 1044 T2R = T2P - T2Q;
Chris@42 1045 T2U = T2S + T2T;
Chris@42 1046 T2V = T2R - T2U;
Chris@42 1047 T4f = FMA(KP500000000, T2W, KP125000000 * T2V);
Chris@42 1048 T4g = KP279508497 * (T2R + T2U);
Chris@42 1049 }
Chris@42 1050 Im[WS(rs, 4)] = KP500000000 * (T2V - T2W);
Chris@42 1051 T4p = T4g - T4f;
Chris@42 1052 Im[0] = T4p - T4q;
Chris@42 1053 Im[WS(rs, 8)] = T4p + T4q;
Chris@42 1054 T4h = T4f + T4g;
Chris@42 1055 Ip[WS(rs, 3)] = T4h - T4o;
Chris@42 1056 Ip[WS(rs, 7)] = T4h + T4o;
Chris@42 1057 }
Chris@42 1058 {
Chris@42 1059 E T4t, T4B, T4u, T4x, T4y, T4z, T4C, T4A;
Chris@42 1060 {
Chris@42 1061 E T4r, T4s, T4v, T4w;
Chris@42 1062 T4r = T2S - T2T;
Chris@42 1063 T4s = T2Q + T2P;
Chris@42 1064 T4t = FNMS(KP293892626, T4s, KP475528258 * T4r);
Chris@42 1065 T4B = FMA(KP475528258, T4s, KP293892626 * T4r);
Chris@42 1066 T4u = T47 - T48;
Chris@42 1067 T4v = T4l + T4m;
Chris@42 1068 T4w = T4i + T4j;
Chris@42 1069 T4x = T4v + T4w;
Chris@42 1070 T4y = FNMS(KP125000000, T4x, KP500000000 * T4u);
Chris@42 1071 T4z = KP279508497 * (T4v - T4w);
Chris@42 1072 }
Chris@42 1073 Rm[WS(rs, 4)] = KP500000000 * (T4u + T4x);
Chris@42 1074 T4C = T4z + T4y;
Chris@42 1075 Rm[WS(rs, 8)] = T4B + T4C;
Chris@42 1076 Rm[0] = T4C - T4B;
Chris@42 1077 T4A = T4y - T4z;
Chris@42 1078 Rp[WS(rs, 3)] = T4t + T4A;
Chris@42 1079 Rp[WS(rs, 7)] = T4A - T4t;
Chris@42 1080 }
Chris@42 1081 {
Chris@42 1082 E T5k, T5m, T3o, T3j, T5b, T5c, T5l, T5d;
Chris@42 1083 {
Chris@42 1084 E T5g, T5j, T37, T3i;
Chris@42 1085 T5g = T5e - T5f;
Chris@42 1086 T5j = T5h - T5i;
Chris@42 1087 T5k = FNMS(KP293892626, T5j, KP475528258 * T5g);
Chris@42 1088 T5m = FMA(KP293892626, T5g, KP475528258 * T5j);
Chris@42 1089 T3o = T3m + T3n;
Chris@42 1090 T37 = T31 - T36;
Chris@42 1091 T3i = T3c + T3h;
Chris@42 1092 T3j = T37 - T3i;
Chris@42 1093 T5b = FMA(KP500000000, T3o, KP125000000 * T3j);
Chris@42 1094 T5c = KP279508497 * (T3i + T37);
Chris@42 1095 }
Chris@42 1096 Im[WS(rs, 9)] = KP500000000 * (T3j - T3o);
Chris@42 1097 T5l = T5b - T5c;
Chris@42 1098 Ip[WS(rs, 2)] = T5l + T5m;
Chris@42 1099 Im[WS(rs, 1)] = T5m - T5l;
Chris@42 1100 T5d = T5b + T5c;
Chris@42 1101 Ip[WS(rs, 6)] = T5d + T5k;
Chris@42 1102 Im[WS(rs, 5)] = T5k - T5d;
Chris@42 1103 }
Chris@42 1104 {
Chris@42 1105 E T5w, T5x, T5n, T5q, T5r, T5s, T5y, T5t;
Chris@42 1106 {
Chris@42 1107 E T5u, T5v, T5o, T5p;
Chris@42 1108 T5u = T36 + T31;
Chris@42 1109 T5v = T3c - T3h;
Chris@42 1110 T5w = FNMS(KP293892626, T5v, KP475528258 * T5u);
Chris@42 1111 T5x = FMA(KP475528258, T5v, KP293892626 * T5u);
Chris@42 1112 T5n = T50 - T51;
Chris@42 1113 T5o = T5f + T5e;
Chris@42 1114 T5p = T5h + T5i;
Chris@42 1115 T5q = T5o + T5p;
Chris@42 1116 T5r = FNMS(KP125000000, T5q, KP500000000 * T5n);
Chris@42 1117 T5s = KP279508497 * (T5o - T5p);
Chris@42 1118 }
Chris@42 1119 Rm[WS(rs, 9)] = KP500000000 * (T5n + T5q);
Chris@42 1120 T5y = T5s + T5r;
Chris@42 1121 Rp[WS(rs, 6)] = T5x + T5y;
Chris@42 1122 Rm[WS(rs, 5)] = T5y - T5x;
Chris@42 1123 T5t = T5r - T5s;
Chris@42 1124 Rp[WS(rs, 2)] = T5t - T5w;
Chris@42 1125 Rm[WS(rs, 1)] = T5w + T5t;
Chris@42 1126 }
Chris@42 1127 {
Chris@42 1128 E T4U, T4W, T3p, T3w, T4D, T4E, T4V, T4F;
Chris@42 1129 {
Chris@42 1130 E T4M, T4T, T3s, T3v;
Chris@42 1131 T4M = T4I - T4L;
Chris@42 1132 T4T = T4P - T4S;
Chris@42 1133 T4U = FNMS(KP475528258, T4T, KP293892626 * T4M);
Chris@42 1134 T4W = FMA(KP475528258, T4M, KP293892626 * T4T);
Chris@42 1135 T3p = T3n - T3m;
Chris@42 1136 T3s = T3q + T3r;
Chris@42 1137 T3v = T3t + T3u;
Chris@42 1138 T3w = T3s + T3v;
Chris@42 1139 T4D = FNMS(KP125000000, T3w, KP500000000 * T3p);
Chris@42 1140 T4E = KP279508497 * (T3s - T3v);
Chris@42 1141 }
Chris@42 1142 Ip[0] = KP500000000 * (T3p + T3w);
Chris@42 1143 T4V = T4E + T4D;
Chris@42 1144 Ip[WS(rs, 4)] = T4V + T4W;
Chris@42 1145 Im[WS(rs, 3)] = T4W - T4V;
Chris@42 1146 T4F = T4D - T4E;
Chris@42 1147 Ip[WS(rs, 8)] = T4F + T4U;
Chris@42 1148 Im[WS(rs, 7)] = T4U - T4F;
Chris@42 1149 }
Chris@42 1150 {
Chris@42 1151 E T58, T59, T52, T53, T4Z, T54, T5a, T55;
Chris@42 1152 {
Chris@42 1153 E T56, T57, T4X, T4Y;
Chris@42 1154 T56 = T3q - T3r;
Chris@42 1155 T57 = T3t - T3u;
Chris@42 1156 T58 = FMA(KP475528258, T56, KP293892626 * T57);
Chris@42 1157 T59 = FNMS(KP293892626, T56, KP475528258 * T57);
Chris@42 1158 T52 = T50 + T51;
Chris@42 1159 T4X = T4I + T4L;
Chris@42 1160 T4Y = T4P + T4S;
Chris@42 1161 T53 = T4X + T4Y;
Chris@42 1162 T4Z = KP279508497 * (T4X - T4Y);
Chris@42 1163 T54 = FNMS(KP125000000, T53, KP500000000 * T52);
Chris@42 1164 }
Chris@42 1165 Rp[0] = KP500000000 * (T52 + T53);
Chris@42 1166 T5a = T54 - T4Z;
Chris@42 1167 Rp[WS(rs, 8)] = T59 + T5a;
Chris@42 1168 Rm[WS(rs, 7)] = T5a - T59;
Chris@42 1169 T55 = T4Z + T54;
Chris@42 1170 Rp[WS(rs, 4)] = T55 - T58;
Chris@42 1171 Rm[WS(rs, 3)] = T58 + T55;
Chris@42 1172 }
Chris@42 1173 }
Chris@42 1174 }
Chris@42 1175 }
Chris@42 1176 }
Chris@42 1177
/*
 * Twiddle instruction table for this codelet; the `desc` descriptor that
 * follows points at it.  It holds four TW_CEXP entries whose last field is
 * 1, 3, 9 and 19 respectively, followed by one TW_NEXT entry in the final
 * slot.
 * NOTE(review): presumably the last field of each TW_CEXP entry selects the
 * twiddle exponent, and only these four factors are stored while the
 * codelet derives the others -- confirm against the tw_instr definition.
 */
Chris@42 1178 static const tw_instr twinstr[] = {
Chris@42 1179 {TW_CEXP, 1, 1},
Chris@42 1180 {TW_CEXP, 1, 3},
Chris@42 1181 {TW_CEXP, 1, 9},
Chris@42 1182 {TW_CEXP, 1, 19},
Chris@42 1183 {TW_NEXT, 1, 0}
Chris@42 1184 };
Chris@42 1185
/*
 * Descriptor passed by address to the registration call below.  Its
 * initializer bundles the value 20, the name string "hc2cfdft2_20", the
 * twinstr table, the address of GENUS and the list {244, 108, 72, 0}.
 * NOTE(review): 20 is presumably the codelet radix and the four-entry list
 * the operation counts (adds, muls, fmas, other) -- confirm against the
 * hc2c_desc declaration in codelet-rdft.h.
 */
Chris@42 1186 static const hc2c_desc desc = { 20, "hc2cfdft2_20", twinstr, &GENUS, {244, 108, 72, 0} };
Chris@42 1187
/*
 * Registration entry point for this codelet.  Forwards the planner `p`,
 * the hc2cfdft2_20 routine (defined earlier in this file), the address of
 * `desc` and the HC2C_VIA_DFT constant to X(khc2c_register).  Returns
 * nothing.
 */
Chris@42 1188 void X(codelet_hc2cfdft2_20) (planner *p) {
Chris@42 1189 X(khc2c_register) (p, hc2cfdft2_20, &desc, HC2C_VIA_DFT);
Chris@42 1190 }
Chris@42 1191 #endif /* HAVE_FMA */