annotate src/fftw-3.3.8/rdft/scalar/r2cf/hc2cfdft_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:13 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cfdft_20 -include rdft/scalar/hc2cf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 286 FP additions, 188 FP multiplications,
Chris@82 32 * (or, 176 additions, 78 multiplications, 110 fused multiply/add),
Chris@82 33 * 153 stack variables, 5 constants, and 80 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cf.h"
Chris@82 36
/*
 * hc2cfdft_20, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the loop body reads ten strided elements from each
 * of Rp, Ip, Rm and Im (offsets WS(rs, 0) .. WS(rs, 9)) plus the 38 twiddle
 * values W[0] .. W[37], and stores ten results back into each of the same
 * four arrays, i.e. the data is overwritten in place.  Every stored value is
 * scaled by KP500000000 (0.5).
 *
 * Parameters:
 *   Rp, Ip - data arrays; advanced by +ms after each iteration
 *   Rm, Im - data arrays; advanced by -ms after each iteration
 *   W      - twiddle table; offset by (mb - 1) * 38 before the first
 *            iteration and advanced by 38 after each iteration
 *   rs     - stride passed to WS() to address the ten elements per array
 *   mb, me - half-open range of the loop counter m
 *   ms     - per-iteration step applied to the four data pointers
 *
 * NOTE(review): the generator command line in the comment above
 * (gen_hc2cdft ... -n 20 -dit) suggests this is a size-20
 * decimation-in-time step; the exact transform semantics are not visible
 * here -- confirm against the genfft / hc2c documentation.
 */
Chris@82 37 static void hc2cfdft_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/*
 * Numeric constants declared through DK.  By value: KP951056516 is
 * sin(2*pi/5), KP559016994 is sqrt(5)/4 and KP618033988 is (sqrt(5)-1)/2.
 */
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 41 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 42 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 43 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 44 {
Chris@82 45 INT m;
/*
 * One iteration per m: Rp/Ip walk forwards and Rm/Im walk backwards by ms,
 * while W moves on to the next group of 38 twiddle values.
 */
Chris@82 46 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@82 47 E T2E, T4W, T3v, T4k, T2M, T4V, T3w, T4j, T2p, T2T, T5a, T5A, T3o, T3D, T4b;
Chris@82 48 E T4B, T1Y, T2S, T57, T5z, T3h, T3C, T44, T4A, TH, T2P, T50, T5x, T32, T3z;
Chris@82 49 E T3P, T4D, T1o, T2Q, T53, T5w, T39, T3A, T3W, T4E;
/*
 * Load phase: read all inputs, form sums and differences of the Rp/Rm and
 * Ip/Im pairs, and multiply them by the twiddle values from W.
 */
Chris@82 50 {
Chris@82 51 E T9, T1V, Tu, T2w, T1, T5, T6, T2Y, T1R, T1T, T1U, T40, T10, T2F, TE;
Chris@82 52 E T2C, T1y, T2m, T4g, TX, T33, TS, TW, Tw, TA, TB, T3L, T2y, T2A, T2B;
Chris@82 53 E T3t, T1q, T1u, T1v, T3d, T2i, T2k, T2l, T48, Tm, Tq, Tr, T3J, T2s, T2u;
Chris@82 54 E T2v, T3r, T1g, T20, T1l, T23, T1h, T3S, T21, T3k, TL, T2H, TQ, T2K, TM;
Chris@82 55 E T35, T2I, T4h, T1I, T1D, T2g, T2f, T46, T2c, T2e, T1E, T3b, T16, T1b, T29;
Chris@82 56 E T28, T3i, T25, T27, T17, T3Q, Tj, Te, T1P, T1O, T3Y, T1L, T1N, Tf, T2W;
Chris@82 57 E T2x, T2D;
Chris@82 58 {
Chris@82 59 E T7, T8, Ts, Tt;
Chris@82 60 T7 = Rp[WS(rs, 9)];
Chris@82 61 T8 = Rm[WS(rs, 9)];
Chris@82 62 T9 = T7 - T8;
Chris@82 63 T1V = T7 + T8;
Chris@82 64 Ts = Rp[WS(rs, 2)];
Chris@82 65 Tt = Rm[WS(rs, 2)];
Chris@82 66 Tu = Ts + Tt;
Chris@82 67 T2w = Ts - Tt;
Chris@82 68 {
Chris@82 69 E T4, T1S, T2, T3;
Chris@82 70 T2 = Ip[WS(rs, 9)];
Chris@82 71 T3 = Im[WS(rs, 9)];
Chris@82 72 T4 = T2 + T3;
Chris@82 73 T1S = T2 - T3;
Chris@82 74 T1 = W[36];
Chris@82 75 T5 = T1 * T4;
Chris@82 76 T6 = W[37];
Chris@82 77 T2Y = T6 * T4;
Chris@82 78 T1R = W[34];
Chris@82 79 T1T = T1R * T1S;
Chris@82 80 T1U = W[35];
Chris@82 81 T40 = T1U * T1S;
Chris@82 82 }
Chris@82 83 }
Chris@82 84 {
Chris@82 85 E TY, TZ, TC, TD;
Chris@82 86 TY = Ip[0];
Chris@82 87 TZ = Im[0];
Chris@82 88 T10 = TY + TZ;
Chris@82 89 T2F = TY - TZ;
Chris@82 90 TC = Rp[WS(rs, 7)];
Chris@82 91 TD = Rm[WS(rs, 7)];
Chris@82 92 TE = TC + TD;
Chris@82 93 T2C = TC - TD;
Chris@82 94 }
Chris@82 95 {
Chris@82 96 E T1w, T1x, TT, TU, TV;
Chris@82 97 T1w = Rp[WS(rs, 1)];
Chris@82 98 T1x = Rm[WS(rs, 1)];
Chris@82 99 T1y = T1w - T1x;
Chris@82 100 T2m = T1w + T1x;
Chris@82 101 TT = Rm[0];
Chris@82 102 TU = Rp[0];
Chris@82 103 TV = TT - TU;
Chris@82 104 T4g = TU + TT;
Chris@82 105 TX = W[0];
Chris@82 106 T33 = TX * TV;
Chris@82 107 TS = W[1];
Chris@82 108 TW = TS * TV;
Chris@82 109 }
Chris@82 110 {
Chris@82 111 E T1d, T1Z, TI, T2G;
Chris@82 112 {
Chris@82 113 E Tz, T2z, Tx, Ty;
Chris@82 114 Tx = Ip[WS(rs, 7)];
Chris@82 115 Ty = Im[WS(rs, 7)];
Chris@82 116 Tz = Tx - Ty;
Chris@82 117 T2z = Tx + Ty;
Chris@82 118 Tw = W[26];
Chris@82 119 TA = Tw * Tz;
Chris@82 120 TB = W[27];
Chris@82 121 T3L = TB * Tz;
Chris@82 122 T2y = W[28];
Chris@82 123 T2A = T2y * T2z;
Chris@82 124 T2B = W[29];
Chris@82 125 T3t = T2B * T2z;
Chris@82 126 }
Chris@82 127 {
Chris@82 128 E T1t, T2j, T1r, T1s;
Chris@82 129 T1r = Ip[WS(rs, 1)];
Chris@82 130 T1s = Im[WS(rs, 1)];
Chris@82 131 T1t = T1r + T1s;
Chris@82 132 T2j = T1r - T1s;
Chris@82 133 T1q = W[4];
Chris@82 134 T1u = T1q * T1t;
Chris@82 135 T1v = W[5];
Chris@82 136 T3d = T1v * T1t;
Chris@82 137 T2i = W[2];
Chris@82 138 T2k = T2i * T2j;
Chris@82 139 T2l = W[3];
Chris@82 140 T48 = T2l * T2j;
Chris@82 141 }
Chris@82 142 {
Chris@82 143 E Tp, T2t, Tn, To;
Chris@82 144 Tn = Ip[WS(rs, 2)];
Chris@82 145 To = Im[WS(rs, 2)];
Chris@82 146 Tp = Tn - To;
Chris@82 147 T2t = Tn + To;
Chris@82 148 Tm = W[6];
Chris@82 149 Tq = Tm * Tp;
Chris@82 150 Tr = W[7];
Chris@82 151 T3J = Tr * Tp;
Chris@82 152 T2s = W[8];
Chris@82 153 T2u = T2s * T2t;
Chris@82 154 T2v = W[9];
Chris@82 155 T3r = T2v * T2t;
Chris@82 156 }
Chris@82 157 {
Chris@82 158 E T1e, T1f, T1j, T1k;
Chris@82 159 T1e = Ip[WS(rs, 3)];
Chris@82 160 T1f = Im[WS(rs, 3)];
Chris@82 161 T1g = T1e - T1f;
Chris@82 162 T20 = T1e + T1f;
Chris@82 163 T1j = Rp[WS(rs, 3)];
Chris@82 164 T1k = Rm[WS(rs, 3)];
Chris@82 165 T1l = T1j + T1k;
Chris@82 166 T23 = T1j - T1k;
Chris@82 167 }
Chris@82 168 T1d = W[10];
Chris@82 169 T1h = T1d * T1g;
Chris@82 170 T3S = T1d * T1l;
Chris@82 171 T1Z = W[12];
Chris@82 172 T21 = T1Z * T20;
Chris@82 173 T3k = T1Z * T23;
Chris@82 174 {
Chris@82 175 E TJ, TK, TO, TP;
Chris@82 176 TJ = Ip[WS(rs, 5)];
Chris@82 177 TK = Im[WS(rs, 5)];
Chris@82 178 TL = TJ + TK;
Chris@82 179 T2H = TJ - TK;
Chris@82 180 TO = Rp[WS(rs, 5)];
Chris@82 181 TP = Rm[WS(rs, 5)];
Chris@82 182 TQ = TO - TP;
Chris@82 183 T2K = TO + TP;
Chris@82 184 }
Chris@82 185 TI = W[20];
Chris@82 186 TM = TI * TL;
Chris@82 187 T35 = TI * TQ;
Chris@82 188 T2G = W[18];
Chris@82 189 T2I = T2G * T2H;
Chris@82 190 T4h = T2G * T2K;
Chris@82 191 {
Chris@82 192 E T1G, T1H, T2d, T1B, T1C, T1A;
Chris@82 193 T1G = Rm[WS(rs, 6)];
Chris@82 194 T1H = Rp[WS(rs, 6)];
Chris@82 195 T1I = T1G - T1H;
Chris@82 196 T1B = Ip[WS(rs, 6)];
Chris@82 197 T1C = Im[WS(rs, 6)];
Chris@82 198 T1D = T1B + T1C;
Chris@82 199 T2d = T1B - T1C;
Chris@82 200 T2g = T1H + T1G;
Chris@82 201 T2f = W[23];
Chris@82 202 T46 = T2f * T2d;
Chris@82 203 T2c = W[22];
Chris@82 204 T2e = T2c * T2d;
Chris@82 205 T1A = W[24];
Chris@82 206 T1E = T1A * T1D;
Chris@82 207 T3b = T1A * T1I;
Chris@82 208 }
Chris@82 209 {
Chris@82 210 E T14, T15, T26, T19, T1a, T13;
Chris@82 211 T14 = Ip[WS(rs, 8)];
Chris@82 212 T15 = Im[WS(rs, 8)];
Chris@82 213 T16 = T14 - T15;
Chris@82 214 T19 = Rp[WS(rs, 8)];
Chris@82 215 T1a = Rm[WS(rs, 8)];
Chris@82 216 T1b = T19 + T1a;
Chris@82 217 T26 = T1a - T19;
Chris@82 218 T29 = T14 + T15;
Chris@82 219 T28 = W[32];
Chris@82 220 T3i = T28 * T26;
Chris@82 221 T25 = W[33];
Chris@82 222 T27 = T25 * T26;
Chris@82 223 T13 = W[30];
Chris@82 224 T17 = T13 * T16;
Chris@82 225 T3Q = T13 * T1b;
Chris@82 226 }
Chris@82 227 {
Chris@82 228 E Th, Ti, T1M, Tc, Td, Tb;
Chris@82 229 Th = Rm[WS(rs, 4)];
Chris@82 230 Ti = Rp[WS(rs, 4)];
Chris@82 231 Tj = Th - Ti;
Chris@82 232 Tc = Ip[WS(rs, 4)];
Chris@82 233 Td = Im[WS(rs, 4)];
Chris@82 234 Te = Tc + Td;
Chris@82 235 T1M = Tc - Td;
Chris@82 236 T1P = Ti + Th;
Chris@82 237 T1O = W[15];
Chris@82 238 T3Y = T1O * T1M;
Chris@82 239 T1L = W[14];
Chris@82 240 T1N = T1L * T1M;
Chris@82 241 Tb = W[16];
Chris@82 242 Tf = Tb * Te;
Chris@82 243 T2W = Tb * Tj;
Chris@82 244 }
Chris@82 245 }
/*
 * Combine the products above through FMA/FNMS and accumulate the
 * intermediate sums and differences (T2E ... T4E, declared at the top of the
 * loop body) that the store sections below consume.
 */
Chris@82 246 T2x = FNMS(T2v, T2w, T2u);
Chris@82 247 T2D = FNMS(T2B, T2C, T2A);
Chris@82 248 T2E = T2x - T2D;
Chris@82 249 T4W = T2x + T2D;
Chris@82 250 {
Chris@82 251 E T3s, T3u, T2L, T4i, T2J;
Chris@82 252 T3s = FMA(T2s, T2w, T3r);
Chris@82 253 T3u = FMA(T2y, T2C, T3t);
Chris@82 254 T3v = T3s + T3u;
Chris@82 255 T4k = T3u - T3s;
Chris@82 256 T2J = W[19];
Chris@82 257 T2L = FNMS(T2J, T2K, T2I);
Chris@82 258 T4i = FMA(T2J, T2H, T4h);
Chris@82 259 T2M = T2F - T2L;
Chris@82 260 T4V = T4g + T4i;
Chris@82 261 T3w = T2L + T2F;
Chris@82 262 T4j = T4g - T4i;
Chris@82 263 }
Chris@82 264 {
Chris@82 265 E T2a, T3j, T24, T3l, T2o, T3n, T4a, T59, T22;
Chris@82 266 T2a = FMA(T28, T29, T27);
Chris@82 267 T3j = FNMS(T25, T29, T3i);
Chris@82 268 T22 = W[13];
Chris@82 269 T24 = FNMS(T22, T23, T21);
Chris@82 270 T3l = FMA(T22, T20, T3k);
Chris@82 271 {
Chris@82 272 E T2h, T2n, T47, T49;
Chris@82 273 T2h = FNMS(T2f, T2g, T2e);
Chris@82 274 T2n = FNMS(T2l, T2m, T2k);
Chris@82 275 T2o = T2h - T2n;
Chris@82 276 T3n = T2h + T2n;
Chris@82 277 T47 = FMA(T2c, T2g, T46);
Chris@82 278 T49 = FMA(T2i, T2m, T48);
Chris@82 279 T4a = T47 - T49;
Chris@82 280 T59 = T47 + T49;
Chris@82 281 }
Chris@82 282 {
Chris@82 283 E T2b, T58, T3m, T45;
Chris@82 284 T2b = T24 - T2a;
Chris@82 285 T2p = T2b - T2o;
Chris@82 286 T2T = T2b + T2o;
Chris@82 287 T58 = T2a + T24;
Chris@82 288 T5a = T58 + T59;
Chris@82 289 T5A = T59 - T58;
Chris@82 290 T3m = T3j - T3l;
Chris@82 291 T3o = T3m - T3n;
Chris@82 292 T3D = T3m + T3n;
Chris@82 293 T45 = T3j + T3l;
Chris@82 294 T4b = T45 + T4a;
Chris@82 295 T4B = T4a - T45;
Chris@82 296 }
Chris@82 297 }
Chris@82 298 {
Chris@82 299 E T1z, T3e, T1J, T3c, T1X, T3g, T42, T55, T1F;
Chris@82 300 T1z = FNMS(T1v, T1y, T1u);
Chris@82 301 T3e = FMA(T1q, T1y, T3d);
Chris@82 302 T1F = W[25];
Chris@82 303 T1J = FMA(T1F, T1I, T1E);
Chris@82 304 T3c = FNMS(T1F, T1D, T3b);
Chris@82 305 {
Chris@82 306 E T1Q, T1W, T3Z, T41;
Chris@82 307 T1Q = FNMS(T1O, T1P, T1N);
Chris@82 308 T1W = FNMS(T1U, T1V, T1T);
Chris@82 309 T1X = T1Q - T1W;
Chris@82 310 T3g = T1Q + T1W;
Chris@82 311 T3Z = FMA(T1L, T1P, T3Y);
Chris@82 312 T41 = FMA(T1R, T1V, T40);
Chris@82 313 T42 = T3Z - T41;
Chris@82 314 T55 = T3Z + T41;
Chris@82 315 }
Chris@82 316 {
Chris@82 317 E T1K, T56, T3f, T43;
Chris@82 318 T1K = T1z - T1J;
Chris@82 319 T1Y = T1K - T1X;
Chris@82 320 T2S = T1X + T1K;
Chris@82 321 T56 = T1J + T1z;
Chris@82 322 T57 = T55 + T56;
Chris@82 323 T5z = T55 - T56;
Chris@82 324 T3f = T3c - T3e;
Chris@82 325 T3h = T3f - T3g;
Chris@82 326 T3C = T3g + T3f;
Chris@82 327 T43 = T3c + T3e;
Chris@82 328 T44 = T42 + T43;
Chris@82 329 T4A = T42 - T43;
Chris@82 330 }
Chris@82 331 }
Chris@82 332 {
Chris@82 333 E Ta, T2Z, Tk, T2X, TG, T31, T3N, T4Y, Tg;
Chris@82 334 Ta = FNMS(T6, T9, T5);
Chris@82 335 T2Z = FMA(T1, T9, T2Y);
Chris@82 336 Tg = W[17];
Chris@82 337 Tk = FMA(Tg, Tj, Tf);
Chris@82 338 T2X = FNMS(Tg, Te, T2W);
Chris@82 339 {
Chris@82 340 E Tv, TF, T3K, T3M;
Chris@82 341 Tv = FNMS(Tr, Tu, Tq);
Chris@82 342 TF = FNMS(TB, TE, TA);
Chris@82 343 TG = Tv - TF;
Chris@82 344 T31 = Tv + TF;
Chris@82 345 T3K = FMA(Tm, Tu, T3J);
Chris@82 346 T3M = FMA(Tw, TE, T3L);
Chris@82 347 T3N = T3K - T3M;
Chris@82 348 T4Y = T3K + T3M;
Chris@82 349 }
Chris@82 350 {
Chris@82 351 E Tl, T4Z, T30, T3O;
Chris@82 352 Tl = Ta - Tk;
Chris@82 353 TH = Tl - TG;
Chris@82 354 T2P = TG + Tl;
Chris@82 355 T4Z = Tk + Ta;
Chris@82 356 T50 = T4Y + T4Z;
Chris@82 357 T5x = T4Y - T4Z;
Chris@82 358 T30 = T2X - T2Z;
Chris@82 359 T32 = T30 - T31;
Chris@82 360 T3z = T31 + T30;
Chris@82 361 T3O = T2X + T2Z;
Chris@82 362 T3P = T3N + T3O;
Chris@82 363 T4D = T3N - T3O;
Chris@82 364 }
Chris@82 365 }
Chris@82 366 {
Chris@82 367 E T11, T34, TR, T36, T1c, T3R, T1m, T3T, TN, T18, T1i;
Chris@82 368 T11 = FMA(TX, T10, TW);
Chris@82 369 T34 = FNMS(TS, T10, T33);
Chris@82 370 TN = W[21];
Chris@82 371 TR = FNMS(TN, TQ, TM);
Chris@82 372 T36 = FMA(TN, TL, T35);
Chris@82 373 T18 = W[31];
Chris@82 374 T1c = FNMS(T18, T1b, T17);
Chris@82 375 T3R = FMA(T18, T16, T3Q);
Chris@82 376 T1i = W[11];
Chris@82 377 T1m = FNMS(T1i, T1l, T1h);
Chris@82 378 T3T = FMA(T1i, T1g, T3S);
Chris@82 379 {
Chris@82 380 E T12, T1n, T51, T52;
Chris@82 381 T12 = TR - T11;
Chris@82 382 T1n = T1c - T1m;
Chris@82 383 T1o = T12 - T1n;
Chris@82 384 T2Q = T1n + T12;
Chris@82 385 T51 = T3R + T3T;
Chris@82 386 T52 = TR + T11;
Chris@82 387 T53 = T51 + T52;
Chris@82 388 T5w = T51 - T52;
Chris@82 389 }
Chris@82 390 {
Chris@82 391 E T37, T38, T3U, T3V;
Chris@82 392 T37 = T34 - T36;
Chris@82 393 T38 = T1c + T1m;
Chris@82 394 T39 = T37 - T38;
Chris@82 395 T3A = T38 + T37;
Chris@82 396 T3U = T3R - T3T;
Chris@82 397 T3V = T36 + T34;
Chris@82 398 T3W = T3U + T3V;
Chris@82 399 T4E = T3U - T3V;
Chris@82 400 }
Chris@82 401 }
Chris@82 402 }
/* Store group: Im[WS(rs, 4)], Im[0], Im[WS(rs, 8)], Ip[WS(rs, 3)], Ip[WS(rs, 7)]. */
Chris@82 403 {
Chris@82 404 E T4G, T4I, T2N, T2r, T4x, T4y, T4H, T4z;
Chris@82 405 {
Chris@82 406 E T4C, T4F, T1p, T2q;
Chris@82 407 T4C = T4A - T4B;
Chris@82 408 T4F = T4D - T4E;
Chris@82 409 T4G = FNMS(KP618033988, T4F, T4C);
Chris@82 410 T4I = FMA(KP618033988, T4C, T4F);
Chris@82 411 T2N = T2E + T2M;
Chris@82 412 T1p = TH + T1o;
Chris@82 413 T2q = T1Y + T2p;
Chris@82 414 T2r = T1p + T2q;
Chris@82 415 T4x = FMA(KP250000000, T2r, T2N);
Chris@82 416 T4y = T1p - T2q;
Chris@82 417 }
Chris@82 418 Im[WS(rs, 4)] = KP500000000 * (T2r - T2N);
Chris@82 419 T4H = FNMS(KP559016994, T4y, T4x);
Chris@82 420 Im[0] = -(KP500000000 * (FMA(KP951056516, T4I, T4H)));
Chris@82 421 Im[WS(rs, 8)] = -(KP500000000 * (FNMS(KP951056516, T4I, T4H)));
Chris@82 422 T4z = FMA(KP559016994, T4y, T4x);
Chris@82 423 Ip[WS(rs, 3)] = KP500000000 * (FNMS(KP951056516, T4G, T4z));
Chris@82 424 Ip[WS(rs, 7)] = KP500000000 * (FMA(KP951056516, T4G, T4z));
Chris@82 425 }
/* Store group: Rm[WS(rs, 4)], Rm[WS(rs, 8)], Rm[0], Rp[WS(rs, 3)], Rp[WS(rs, 7)]. */
Chris@82 426 {
Chris@82 427 E T4S, T4U, T4J, T4M, T4N, T4O, T4T, T4P;
Chris@82 428 {
Chris@82 429 E T4Q, T4R, T4K, T4L;
Chris@82 430 T4Q = T2p - T1Y;
Chris@82 431 T4R = T1o - TH;
Chris@82 432 T4S = FNMS(KP618033988, T4R, T4Q);
Chris@82 433 T4U = FMA(KP618033988, T4Q, T4R);
Chris@82 434 T4J = T4j - T4k;
Chris@82 435 T4K = T4D + T4E;
Chris@82 436 T4L = T4A + T4B;
Chris@82 437 T4M = T4K + T4L;
Chris@82 438 T4N = FNMS(KP250000000, T4M, T4J);
Chris@82 439 T4O = T4K - T4L;
Chris@82 440 }
Chris@82 441 Rm[WS(rs, 4)] = KP500000000 * (T4J + T4M);
Chris@82 442 T4T = FMA(KP559016994, T4O, T4N);
Chris@82 443 Rm[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T4U, T4T));
Chris@82 444 Rm[0] = KP500000000 * (FNMS(KP951056516, T4U, T4T));
Chris@82 445 T4P = FNMS(KP559016994, T4O, T4N);
Chris@82 446 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T4S, T4P));
Chris@82 447 Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T4S, T4P));
Chris@82 448 }
/* Store group: Ip[WS(rs, 5)], Im[WS(rs, 2)], Im[WS(rs, 6)], Ip[WS(rs, 1)], Ip[WS(rs, 9)]. */
Chris@82 449 {
Chris@82 450 E T4d, T4f, T2O, T2V, T3G, T3H, T4e, T3I;
Chris@82 451 {
Chris@82 452 E T3X, T4c, T2R, T2U;
Chris@82 453 T3X = T3P - T3W;
Chris@82 454 T4c = T44 - T4b;
Chris@82 455 T4d = FMA(KP618033988, T4c, T3X);
Chris@82 456 T4f = FNMS(KP618033988, T3X, T4c);
Chris@82 457 T2O = T2M - T2E;
Chris@82 458 T2R = T2P + T2Q;
Chris@82 459 T2U = T2S + T2T;
Chris@82 460 T2V = T2R + T2U;
Chris@82 461 T3G = FNMS(KP250000000, T2V, T2O);
Chris@82 462 T3H = T2R - T2U;
Chris@82 463 }
Chris@82 464 Ip[WS(rs, 5)] = KP500000000 * (T2O + T2V);
Chris@82 465 T4e = FNMS(KP559016994, T3H, T3G);
Chris@82 466 Im[WS(rs, 2)] = -(KP500000000 * (FMA(KP951056516, T4f, T4e)));
Chris@82 467 Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP951056516, T4f, T4e)));
Chris@82 468 T3I = FMA(KP559016994, T3H, T3G);
Chris@82 469 Ip[WS(rs, 1)] = KP500000000 * (FNMS(KP951056516, T4d, T3I));
Chris@82 470 Ip[WS(rs, 9)] = KP500000000 * (FMA(KP951056516, T4d, T3I));
Chris@82 471 }
/* Store group: Rp[WS(rs, 5)], Rm[WS(rs, 6)], Rm[WS(rs, 2)], Rp[WS(rs, 1)], Rp[WS(rs, 9)]. */
Chris@82 472 {
Chris@82 473 E T4u, T4w, T4l, T4o, T4p, T4q, T4v, T4r;
Chris@82 474 {
Chris@82 475 E T4s, T4t, T4m, T4n;
Chris@82 476 T4s = T2P - T2Q;
Chris@82 477 T4t = T2S - T2T;
Chris@82 478 T4u = FMA(KP618033988, T4t, T4s);
Chris@82 479 T4w = FNMS(KP618033988, T4s, T4t);
Chris@82 480 T4l = T4j + T4k;
Chris@82 481 T4m = T3P + T3W;
Chris@82 482 T4n = T44 + T4b;
Chris@82 483 T4o = T4m + T4n;
Chris@82 484 T4p = FNMS(KP250000000, T4o, T4l);
Chris@82 485 T4q = T4m - T4n;
Chris@82 486 }
Chris@82 487 Rp[WS(rs, 5)] = KP500000000 * (T4l + T4o);
Chris@82 488 T4v = FNMS(KP559016994, T4q, T4p);
Chris@82 489 Rm[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T4w, T4v));
Chris@82 490 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T4w, T4v));
Chris@82 491 T4r = FMA(KP559016994, T4q, T4p);
Chris@82 492 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T4u, T4r));
Chris@82 493 Rp[WS(rs, 9)] = KP500000000 * (FNMS(KP951056516, T4u, T4r));
Chris@82 494 }
/* Store group: Im[WS(rs, 9)], Ip[WS(rs, 2)], Im[WS(rs, 1)], Ip[WS(rs, 6)], Im[WS(rs, 5)]. */
Chris@82 495 {
Chris@82 496 E T5C, T5E, T3x, T3q, T5t, T5u, T5D, T5v;
Chris@82 497 {
Chris@82 498 E T5y, T5B, T3a, T3p;
Chris@82 499 T5y = T5w - T5x;
Chris@82 500 T5B = T5z - T5A;
Chris@82 501 T5C = FNMS(KP618033988, T5B, T5y);
Chris@82 502 T5E = FMA(KP618033988, T5y, T5B);
Chris@82 503 T3x = T3v + T3w;
Chris@82 504 T3a = T32 + T39;
Chris@82 505 T3p = T3h + T3o;
Chris@82 506 T3q = T3a + T3p;
Chris@82 507 T5t = FMA(KP250000000, T3q, T3x);
Chris@82 508 T5u = T3p - T3a;
Chris@82 509 }
Chris@82 510 Im[WS(rs, 9)] = KP500000000 * (T3q - T3x);
Chris@82 511 T5D = FNMS(KP559016994, T5u, T5t);
Chris@82 512 Ip[WS(rs, 2)] = KP500000000 * (FMA(KP951056516, T5E, T5D));
Chris@82 513 Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP951056516, T5E, T5D)));
Chris@82 514 T5v = FMA(KP559016994, T5u, T5t);
Chris@82 515 Ip[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T5C, T5v));
Chris@82 516 Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP951056516, T5C, T5v)));
Chris@82 517 }
/* Store group: Rm[WS(rs, 9)], Rp[WS(rs, 6)], Rm[WS(rs, 5)], Rp[WS(rs, 2)], Rm[WS(rs, 1)]. */
Chris@82 518 {
Chris@82 519 E T5O, T5Q, T5F, T5I, T5J, T5K, T5P, T5L;
Chris@82 520 {
Chris@82 521 E T5M, T5N, T5G, T5H;
Chris@82 522 T5M = T3o - T3h;
Chris@82 523 T5N = T39 - T32;
Chris@82 524 T5O = FNMS(KP618033988, T5N, T5M);
Chris@82 525 T5Q = FMA(KP618033988, T5M, T5N);
Chris@82 526 T5F = T4V - T4W;
Chris@82 527 T5G = T5x + T5w;
Chris@82 528 T5H = T5z + T5A;
Chris@82 529 T5I = T5G + T5H;
Chris@82 530 T5J = FNMS(KP250000000, T5I, T5F);
Chris@82 531 T5K = T5G - T5H;
Chris@82 532 }
Chris@82 533 Rm[WS(rs, 9)] = KP500000000 * (T5F + T5I);
Chris@82 534 T5P = FMA(KP559016994, T5K, T5J);
Chris@82 535 Rp[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T5Q, T5P));
Chris@82 536 Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP951056516, T5Q, T5P));
Chris@82 537 T5L = FNMS(KP559016994, T5K, T5J);
Chris@82 538 Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T5O, T5L));
Chris@82 539 Rm[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T5O, T5L));
Chris@82 540 }
/* Store group: Ip[0], Ip[WS(rs, 8)], Im[WS(rs, 7)], Ip[WS(rs, 4)], Im[WS(rs, 3)]. */
Chris@82 541 {
Chris@82 542 E T5q, T5s, T3y, T3F, T5l, T5m, T5r, T5n;
Chris@82 543 {
Chris@82 544 E T5o, T5p, T3B, T3E;
Chris@82 545 T5o = T50 - T53;
Chris@82 546 T5p = T5a - T57;
Chris@82 547 T5q = FNMS(KP618033988, T5p, T5o);
Chris@82 548 T5s = FMA(KP618033988, T5o, T5p);
Chris@82 549 T3y = T3w - T3v;
Chris@82 550 T3B = T3z + T3A;
Chris@82 551 T3E = T3C + T3D;
Chris@82 552 T3F = T3B + T3E;
Chris@82 553 T5l = FNMS(KP250000000, T3F, T3y);
Chris@82 554 T5m = T3B - T3E;
Chris@82 555 }
Chris@82 556 Ip[0] = KP500000000 * (T3y + T3F);
Chris@82 557 T5r = FNMS(KP559016994, T5m, T5l);
Chris@82 558 Ip[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5s, T5r));
Chris@82 559 Im[WS(rs, 7)] = -(KP500000000 * (FNMS(KP951056516, T5s, T5r)));
Chris@82 560 T5n = FMA(KP559016994, T5m, T5l);
Chris@82 561 Ip[WS(rs, 4)] = KP500000000 * (FMA(KP951056516, T5q, T5n));
Chris@82 562 Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP951056516, T5q, T5n)));
Chris@82 563 }
/* Store group: Rp[0], Rp[WS(rs, 8)], Rm[WS(rs, 7)], Rp[WS(rs, 4)], Rm[WS(rs, 3)]. */
Chris@82 564 {
Chris@82 565 E T5i, T5k, T4X, T5c, T5d, T5e, T5j, T5f;
Chris@82 566 {
Chris@82 567 E T5g, T5h, T54, T5b;
Chris@82 568 T5g = T3z - T3A;
Chris@82 569 T5h = T3C - T3D;
Chris@82 570 T5i = FMA(KP618033988, T5h, T5g);
Chris@82 571 T5k = FNMS(KP618033988, T5g, T5h);
Chris@82 572 T4X = T4V + T4W;
Chris@82 573 T54 = T50 + T53;
Chris@82 574 T5b = T57 + T5a;
Chris@82 575 T5c = T54 + T5b;
Chris@82 576 T5d = FNMS(KP250000000, T5c, T4X);
Chris@82 577 T5e = T54 - T5b;
Chris@82 578 }
Chris@82 579 Rp[0] = KP500000000 * (T4X + T5c);
Chris@82 580 T5j = FNMS(KP559016994, T5e, T5d);
Chris@82 581 Rp[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5k, T5j));
Chris@82 582 Rm[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T5k, T5j));
Chris@82 583 T5f = FMA(KP559016994, T5e, T5d);
Chris@82 584 Rp[WS(rs, 4)] = KP500000000 * (FNMS(KP951056516, T5i, T5f));
Chris@82 585 Rm[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T5i, T5f));
Chris@82 586 }
Chris@82 587 }
Chris@82 588 }
Chris@82 589 }
Chris@82 590
/*
 * Twiddle instruction table referenced by the codelet descriptor: a TW_FULL
 * entry with arguments (1, 20) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): the meaning of TW_FULL / TW_NEXT and of their arguments is
 * defined outside this file -- confirm against the tw_instr declaration.
 */
Chris@82 591 static const tw_instr twinstr[] = {
Chris@82 592 {TW_FULL, 1, 20},
Chris@82 593 {TW_NEXT, 1, 0}
Chris@82 594 };
Chris@82 595
/*
 * Codelet descriptor passed to the registration call: the value 20
 * (presumably the transform size -- confirm against hc2c_desc), the codelet
 * name, the twiddle table, &GENUS, and the counts {176, 78, 110, 0}.  The
 * first three counts match the additions / multiplications / fused
 * multiply-add figures quoted in the generator comment for this variant.
 */
Chris@82 596 static const hc2c_desc desc = { 20, "hc2cfdft_20", twinstr, &GENUS, {176, 78, 110, 0} };
Chris@82 597
/*
 * Registration entry point: hands hc2cfdft_20 and its descriptor to the
 * planner p through X(khc2c_register), tagged with HC2C_VIA_DFT.
 */
Chris@82 598 void X(codelet_hc2cfdft_20) (planner *p) {
Chris@82 599 X(khc2c_register) (p, hc2cfdft_20, &desc, HC2C_VIA_DFT);
Chris@82 600 }
Chris@82 601 #else
Chris@82 602
Chris@82 603 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cfdft_20 -include rdft/scalar/hc2cf.h */
Chris@82 604
Chris@82 605 /*
Chris@82 606 * This function contains 286 FP additions, 140 FP multiplications,
Chris@82 607 * (or, 224 additions, 78 multiplications, 62 fused multiply/add),
Chris@82 608 * 98 stack variables, 5 constants, and 80 memory accesses
Chris@82 609 */
Chris@82 610 #include "rdft/scalar/hc2cf.h"
Chris@82 611
Chris@82 612 static void hc2cfdft_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 613 {
Chris@82 614 DK(KP125000000, +0.125000000000000000000000000000000000000000000);
Chris@82 615 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 616 DK(KP279508497, +0.279508497187473712051146708591409529430077295);
Chris@82 617 DK(KP293892626, +0.293892626146236564584352977319536384298826219);
Chris@82 618 DK(KP475528258, +0.475528258147576786058219666689691071702849317);
Chris@82 619 {
Chris@82 620 INT m;
Chris@82 621 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@82 622 E T12, T2w, T4o, T4V, T2H, T3a, T4y, T4Y, T1z, T2v, T25, T2y, T2s, T2z, T4v;
Chris@82 623 E T4X, T4r, T4U, T3A, T3Z, T2X, T37, T3k, T41, T2M, T39, T3v, T3Y, T2S, T36;
Chris@82 624 E T3p, T42, Td, T4G, T33, T3N, Tw, T4H, T32, T3O;
Chris@82 625 {
Chris@82 626 E T3, T3L, T1x, T2V, Th, Tl, TC, T3g, Tq, Tu, TH, T3h, T7, Tb, T1q;
Chris@82 627 E T2U, TR, T2P, T1F, T3r, T23, T2K, T2f, T3y, T1k, T3m, T2q, T2E, T10, T2Q;
Chris@82 628 E T1K, T3s, T1U, T2J, T2a, T3x, T1b, T3l, T2l, T2D;
Chris@82 629 {
Chris@82 630 E T1, T2, T1s, T1u, T1v, T1w, T1r, T1t;
Chris@82 631 T1 = Ip[0];
Chris@82 632 T2 = Im[0];
Chris@82 633 T1s = T1 + T2;
Chris@82 634 T1u = Rp[0];
Chris@82 635 T1v = Rm[0];
Chris@82 636 T1w = T1u - T1v;
Chris@82 637 T3 = T1 - T2;
Chris@82 638 T3L = T1u + T1v;
Chris@82 639 T1r = W[0];
Chris@82 640 T1t = W[1];
Chris@82 641 T1x = FNMS(T1t, T1w, T1r * T1s);
Chris@82 642 T2V = FMA(T1r, T1w, T1t * T1s);
Chris@82 643 }
Chris@82 644 {
Chris@82 645 E Tf, Tg, Tz, Tj, Tk, TB, Ty, TA;
Chris@82 646 Tf = Ip[WS(rs, 2)];
Chris@82 647 Tg = Im[WS(rs, 2)];
Chris@82 648 Tz = Tf - Tg;
Chris@82 649 Tj = Rp[WS(rs, 2)];
Chris@82 650 Tk = Rm[WS(rs, 2)];
Chris@82 651 TB = Tj + Tk;
Chris@82 652 Th = Tf + Tg;
Chris@82 653 Tl = Tj - Tk;
Chris@82 654 Ty = W[6];
Chris@82 655 TA = W[7];
Chris@82 656 TC = FNMS(TA, TB, Ty * Tz);
Chris@82 657 T3g = FMA(TA, Tz, Ty * TB);
Chris@82 658 }
Chris@82 659 {
Chris@82 660 E To, Tp, TE, Ts, Tt, TG, TD, TF;
Chris@82 661 To = Ip[WS(rs, 7)];
Chris@82 662 Tp = Im[WS(rs, 7)];
Chris@82 663 TE = To - Tp;
Chris@82 664 Ts = Rp[WS(rs, 7)];
Chris@82 665 Tt = Rm[WS(rs, 7)];
Chris@82 666 TG = Ts + Tt;
Chris@82 667 Tq = To + Tp;
Chris@82 668 Tu = Ts - Tt;
Chris@82 669 TD = W[26];
Chris@82 670 TF = W[27];
Chris@82 671 TH = FNMS(TF, TG, TD * TE);
Chris@82 672 T3h = FMA(TF, TE, TD * TG);
Chris@82 673 }
Chris@82 674 {
Chris@82 675 E T5, T6, T1n, T9, Ta, T1p, T1m, T1o;
Chris@82 676 T5 = Ip[WS(rs, 5)];
Chris@82 677 T6 = Im[WS(rs, 5)];
Chris@82 678 T1n = T5 + T6;
Chris@82 679 T9 = Rp[WS(rs, 5)];
Chris@82 680 Ta = Rm[WS(rs, 5)];
Chris@82 681 T1p = T9 - Ta;
Chris@82 682 T7 = T5 - T6;
Chris@82 683 Tb = T9 + Ta;
Chris@82 684 T1m = W[20];
Chris@82 685 T1o = W[21];
Chris@82 686 T1q = FNMS(T1o, T1p, T1m * T1n);
Chris@82 687 T2U = FMA(T1m, T1p, T1o * T1n);
Chris@82 688 }
Chris@82 689 {
Chris@82 690 E TM, T1C, TQ, T1E;
Chris@82 691 {
Chris@82 692 E TK, TL, TO, TP;
Chris@82 693 TK = Ip[WS(rs, 4)];
Chris@82 694 TL = Im[WS(rs, 4)];
Chris@82 695 TM = TK + TL;
Chris@82 696 T1C = TK - TL;
Chris@82 697 TO = Rp[WS(rs, 4)];
Chris@82 698 TP = Rm[WS(rs, 4)];
Chris@82 699 TQ = TO - TP;
Chris@82 700 T1E = TO + TP;
Chris@82 701 }
Chris@82 702 {
Chris@82 703 E TJ, TN, T1B, T1D;
Chris@82 704 TJ = W[16];
Chris@82 705 TN = W[17];
Chris@82 706 TR = FNMS(TN, TQ, TJ * TM);
Chris@82 707 T2P = FMA(TN, TM, TJ * TQ);
Chris@82 708 T1B = W[14];
Chris@82 709 T1D = W[15];
Chris@82 710 T1F = FNMS(T1D, T1E, T1B * T1C);
Chris@82 711 T3r = FMA(T1D, T1C, T1B * T1E);
Chris@82 712 }
Chris@82 713 }
Chris@82 714 {
Chris@82 715 E T1Y, T2c, T22, T2e;
Chris@82 716 {
Chris@82 717 E T1W, T1X, T20, T21;
Chris@82 718 T1W = Ip[WS(rs, 1)];
Chris@82 719 T1X = Im[WS(rs, 1)];
Chris@82 720 T1Y = T1W + T1X;
Chris@82 721 T2c = T1W - T1X;
Chris@82 722 T20 = Rp[WS(rs, 1)];
Chris@82 723 T21 = Rm[WS(rs, 1)];
Chris@82 724 T22 = T20 - T21;
Chris@82 725 T2e = T20 + T21;
Chris@82 726 }
Chris@82 727 {
Chris@82 728 E T1V, T1Z, T2b, T2d;
Chris@82 729 T1V = W[4];
Chris@82 730 T1Z = W[5];
Chris@82 731 T23 = FNMS(T1Z, T22, T1V * T1Y);
Chris@82 732 T2K = FMA(T1Z, T1Y, T1V * T22);
Chris@82 733 T2b = W[2];
Chris@82 734 T2d = W[3];
Chris@82 735 T2f = FNMS(T2d, T2e, T2b * T2c);
Chris@82 736 T3y = FMA(T2d, T2c, T2b * T2e);
Chris@82 737 }
Chris@82 738 }
Chris@82 739 {
Chris@82 740 E T1f, T2n, T1j, T2p;
Chris@82 741 {
Chris@82 742 E T1d, T1e, T1h, T1i;
Chris@82 743 T1d = Ip[WS(rs, 3)];
Chris@82 744 T1e = Im[WS(rs, 3)];
Chris@82 745 T1f = T1d - T1e;
Chris@82 746 T2n = T1d + T1e;
Chris@82 747 T1h = Rp[WS(rs, 3)];
Chris@82 748 T1i = Rm[WS(rs, 3)];
Chris@82 749 T1j = T1h + T1i;
Chris@82 750 T2p = T1h - T1i;
Chris@82 751 }
Chris@82 752 {
Chris@82 753 E T1c, T1g, T2m, T2o;
Chris@82 754 T1c = W[10];
Chris@82 755 T1g = W[11];
Chris@82 756 T1k = FNMS(T1g, T1j, T1c * T1f);
Chris@82 757 T3m = FMA(T1c, T1j, T1g * T1f);
Chris@82 758 T2m = W[12];
Chris@82 759 T2o = W[13];
Chris@82 760 T2q = FNMS(T2o, T2p, T2m * T2n);
Chris@82 761 T2E = FMA(T2m, T2p, T2o * T2n);
Chris@82 762 }
Chris@82 763 }
Chris@82 764 {
Chris@82 765 E TV, T1H, TZ, T1J;
Chris@82 766 {
Chris@82 767 E TT, TU, TX, TY;
Chris@82 768 TT = Ip[WS(rs, 9)];
Chris@82 769 TU = Im[WS(rs, 9)];
Chris@82 770 TV = TT + TU;
Chris@82 771 T1H = TT - TU;
Chris@82 772 TX = Rp[WS(rs, 9)];
Chris@82 773 TY = Rm[WS(rs, 9)];
Chris@82 774 TZ = TX - TY;
Chris@82 775 T1J = TX + TY;
Chris@82 776 }
Chris@82 777 {
Chris@82 778 E TS, TW, T1G, T1I;
Chris@82 779 TS = W[36];
Chris@82 780 TW = W[37];
Chris@82 781 T10 = FNMS(TW, TZ, TS * TV);
Chris@82 782 T2Q = FMA(TW, TV, TS * TZ);
Chris@82 783 T1G = W[34];
Chris@82 784 T1I = W[35];
Chris@82 785 T1K = FNMS(T1I, T1J, T1G * T1H);
Chris@82 786 T3s = FMA(T1I, T1H, T1G * T1J);
Chris@82 787 }
Chris@82 788 }
Chris@82 789 {
Chris@82 790 E T1P, T27, T1T, T29;
Chris@82 791 {
Chris@82 792 E T1N, T1O, T1R, T1S;
Chris@82 793 T1N = Ip[WS(rs, 6)];
Chris@82 794 T1O = Im[WS(rs, 6)];
Chris@82 795 T1P = T1N + T1O;
Chris@82 796 T27 = T1N - T1O;
Chris@82 797 T1R = Rp[WS(rs, 6)];
Chris@82 798 T1S = Rm[WS(rs, 6)];
Chris@82 799 T1T = T1R - T1S;
Chris@82 800 T29 = T1R + T1S;
Chris@82 801 }
Chris@82 802 {
Chris@82 803 E T1M, T1Q, T26, T28;
Chris@82 804 T1M = W[24];
Chris@82 805 T1Q = W[25];
Chris@82 806 T1U = FNMS(T1Q, T1T, T1M * T1P);
Chris@82 807 T2J = FMA(T1Q, T1P, T1M * T1T);
Chris@82 808 T26 = W[22];
Chris@82 809 T28 = W[23];
Chris@82 810 T2a = FNMS(T28, T29, T26 * T27);
Chris@82 811 T3x = FMA(T28, T27, T26 * T29);
Chris@82 812 }
Chris@82 813 }
Chris@82 814 {
Chris@82 815 E T16, T2k, T1a, T2i;
Chris@82 816 {
Chris@82 817 E T14, T15, T18, T19;
Chris@82 818 T14 = Ip[WS(rs, 8)];
Chris@82 819 T15 = Im[WS(rs, 8)];
Chris@82 820 T16 = T14 - T15;
Chris@82 821 T2k = T14 + T15;
Chris@82 822 T18 = Rp[WS(rs, 8)];
Chris@82 823 T19 = Rm[WS(rs, 8)];
Chris@82 824 T1a = T18 + T19;
Chris@82 825 T2i = T19 - T18;
Chris@82 826 }
Chris@82 827 {
Chris@82 828 E T13, T17, T2h, T2j;
Chris@82 829 T13 = W[30];
Chris@82 830 T17 = W[31];
Chris@82 831 T1b = FNMS(T17, T1a, T13 * T16);
Chris@82 832 T3l = FMA(T13, T1a, T17 * T16);
Chris@82 833 T2h = W[33];
Chris@82 834 T2j = W[32];
Chris@82 835 T2l = FMA(T2h, T2i, T2j * T2k);
Chris@82 836 T2D = FNMS(T2h, T2k, T2j * T2i);
Chris@82 837 }
Chris@82 838 }
Chris@82 839 {
Chris@82 840 E T2g, T2r, T3n, T3o;
Chris@82 841 {
Chris@82 842 E TI, T11, T4m, T4n;
Chris@82 843 TI = TC - TH;
Chris@82 844 T11 = TR - T10;
Chris@82 845 T12 = TI - T11;
Chris@82 846 T2w = TI + T11;
Chris@82 847 T4m = T3g + T3h;
Chris@82 848 T4n = TR + T10;
Chris@82 849 T4o = T4m + T4n;
Chris@82 850 T4V = T4m - T4n;
Chris@82 851 }
Chris@82 852 {
Chris@82 853 E T2F, T2G, T4w, T4x;
Chris@82 854 T2F = T2D - T2E;
Chris@82 855 T2G = T2a + T2f;
Chris@82 856 T2H = T2F - T2G;
Chris@82 857 T3a = T2F + T2G;
Chris@82 858 T4w = T2l + T2q;
Chris@82 859 T4x = T3x + T3y;
Chris@82 860 T4y = T4w + T4x;
Chris@82 861 T4Y = T4x - T4w;
Chris@82 862 }
Chris@82 863 {
Chris@82 864 E T1l, T1y, T1L, T24;
Chris@82 865 T1l = T1b - T1k;
Chris@82 866 T1y = T1q - T1x;
Chris@82 867 T1z = T1l + T1y;
Chris@82 868 T2v = T1y - T1l;
Chris@82 869 T1L = T1F - T1K;
Chris@82 870 T24 = T1U - T23;
Chris@82 871 T25 = T1L - T24;
Chris@82 872 T2y = T1L + T24;
Chris@82 873 }
Chris@82 874 T2g = T2a - T2f;
Chris@82 875 T2r = T2l - T2q;
Chris@82 876 T2s = T2g - T2r;
Chris@82 877 T2z = T2r + T2g;
Chris@82 878 {
Chris@82 879 E T4t, T4u, T4p, T4q;
Chris@82 880 T4t = T3r + T3s;
Chris@82 881 T4u = T1U + T23;
Chris@82 882 T4v = T4t + T4u;
Chris@82 883 T4X = T4t - T4u;
Chris@82 884 T4p = T3l + T3m;
Chris@82 885 T4q = T1q + T1x;
Chris@82 886 T4r = T4p + T4q;
Chris@82 887 T4U = T4p - T4q;
Chris@82 888 }
Chris@82 889 {
Chris@82 890 E T3w, T3z, T2T, T2W;
Chris@82 891 T3w = T2D + T2E;
Chris@82 892 T3z = T3x - T3y;
Chris@82 893 T3A = T3w + T3z;
Chris@82 894 T3Z = T3z - T3w;
Chris@82 895 T2T = T1b + T1k;
Chris@82 896 T2W = T2U + T2V;
Chris@82 897 T2X = T2T + T2W;
Chris@82 898 T37 = T2T - T2W;
Chris@82 899 }
Chris@82 900 {
Chris@82 901 E T3i, T3j, T2I, T2L;
Chris@82 902 T3i = T3g - T3h;
Chris@82 903 T3j = T2Q - T2P;
Chris@82 904 T3k = T3i + T3j;
Chris@82 905 T41 = T3i - T3j;
Chris@82 906 T2I = T1F + T1K;
Chris@82 907 T2L = T2J + T2K;
Chris@82 908 T2M = T2I + T2L;
Chris@82 909 T39 = T2I - T2L;
Chris@82 910 }
Chris@82 911 {
Chris@82 912 E T3t, T3u, T2O, T2R;
Chris@82 913 T3t = T3r - T3s;
Chris@82 914 T3u = T2K - T2J;
Chris@82 915 T3v = T3t + T3u;
Chris@82 916 T3Y = T3t - T3u;
Chris@82 917 T2O = TC + TH;
Chris@82 918 T2R = T2P + T2Q;
Chris@82 919 T2S = T2O + T2R;
Chris@82 920 T36 = T2O - T2R;
Chris@82 921 }
Chris@82 922 T3n = T3l - T3m;
Chris@82 923 T3o = T2U - T2V;
Chris@82 924 T3p = T3n + T3o;
Chris@82 925 T42 = T3n - T3o;
Chris@82 926 {
Chris@82 927 E Tc, T3M, T4, T8;
Chris@82 928 T4 = W[18];
Chris@82 929 T8 = W[19];
Chris@82 930 Tc = FNMS(T8, Tb, T4 * T7);
Chris@82 931 T3M = FMA(T4, Tb, T8 * T7);
Chris@82 932 Td = T3 - Tc;
Chris@82 933 T4G = T3L + T3M;
Chris@82 934 T33 = Tc + T3;
Chris@82 935 T3N = T3L - T3M;
Chris@82 936 }
Chris@82 937 {
Chris@82 938 E Tm, T30, Tv, T31;
Chris@82 939 {
Chris@82 940 E Te, Ti, Tn, Tr;
Chris@82 941 Te = W[8];
Chris@82 942 Ti = W[9];
Chris@82 943 Tm = FNMS(Ti, Tl, Te * Th);
Chris@82 944 T30 = FMA(Ti, Th, Te * Tl);
Chris@82 945 Tn = W[28];
Chris@82 946 Tr = W[29];
Chris@82 947 Tv = FNMS(Tr, Tu, Tn * Tq);
Chris@82 948 T31 = FMA(Tr, Tq, Tn * Tu);
Chris@82 949 }
Chris@82 950 Tw = Tm - Tv;
Chris@82 951 T4H = Tm + Tv;
Chris@82 952 T32 = T30 + T31;
Chris@82 953 T3O = T31 - T30;
Chris@82 954 }
Chris@82 955 }
Chris@82 956 }
Chris@82 957 {
Chris@82 958 E T3C, T3E, Tx, T2u, T3d, T3e, T3D, T3f;
Chris@82 959 {
Chris@82 960 E T3q, T3B, T1A, T2t;
Chris@82 961 T3q = T3k - T3p;
Chris@82 962 T3B = T3v - T3A;
Chris@82 963 T3C = FMA(KP475528258, T3q, KP293892626 * T3B);
Chris@82 964 T3E = FNMS(KP293892626, T3q, KP475528258 * T3B);
Chris@82 965 Tx = Td - Tw;
Chris@82 966 T1A = T12 + T1z;
Chris@82 967 T2t = T25 + T2s;
Chris@82 968 T2u = T1A + T2t;
Chris@82 969 T3d = KP279508497 * (T1A - T2t);
Chris@82 970 T3e = FNMS(KP125000000, T2u, KP500000000 * Tx);
Chris@82 971 }
Chris@82 972 Ip[WS(rs, 5)] = KP500000000 * (Tx + T2u);
Chris@82 973 T3D = T3d - T3e;
Chris@82 974 Im[WS(rs, 2)] = T3D - T3E;
Chris@82 975 Im[WS(rs, 6)] = T3D + T3E;
Chris@82 976 T3f = T3d + T3e;
Chris@82 977 Ip[WS(rs, 1)] = T3f - T3C;
Chris@82 978 Ip[WS(rs, 9)] = T3f + T3C;
Chris@82 979 }
Chris@82 980 {
Chris@82 981 E T3H, T3T, T3P, T3Q, T3K, T3R, T3U, T3S;
Chris@82 982 {
Chris@82 983 E T3F, T3G, T3I, T3J;
Chris@82 984 T3F = T12 - T1z;
Chris@82 985 T3G = T25 - T2s;
Chris@82 986 T3H = FMA(KP475528258, T3F, KP293892626 * T3G);
Chris@82 987 T3T = FNMS(KP293892626, T3F, KP475528258 * T3G);
Chris@82 988 T3P = T3N + T3O;
Chris@82 989 T3I = T3k + T3p;
Chris@82 990 T3J = T3v + T3A;
Chris@82 991 T3Q = T3I + T3J;
Chris@82 992 T3K = KP279508497 * (T3I - T3J);
Chris@82 993 T3R = FNMS(KP125000000, T3Q, KP500000000 * T3P);
Chris@82 994 }
Chris@82 995 Rp[WS(rs, 5)] = KP500000000 * (T3P + T3Q);
Chris@82 996 T3U = T3R - T3K;
Chris@82 997 Rm[WS(rs, 6)] = T3T + T3U;
Chris@82 998 Rm[WS(rs, 2)] = T3U - T3T;
Chris@82 999 T3S = T3K + T3R;
Chris@82 1000 Rp[WS(rs, 1)] = T3H + T3S;
Chris@82 1001 Rp[WS(rs, 9)] = T3S - T3H;
Chris@82 1002 }
Chris@82 1003 {
Chris@82 1004 E T44, T46, T2C, T2B, T3V, T3W, T45, T3X;
Chris@82 1005 {
Chris@82 1006 E T40, T43, T2x, T2A;
Chris@82 1007 T40 = T3Y - T3Z;
Chris@82 1008 T43 = T41 - T42;
Chris@82 1009 T44 = FNMS(KP293892626, T43, KP475528258 * T40);
Chris@82 1010 T46 = FMA(KP475528258, T43, KP293892626 * T40);
Chris@82 1011 T2C = Tw + Td;
Chris@82 1012 T2x = T2v - T2w;
Chris@82 1013 T2A = T2y + T2z;
Chris@82 1014 T2B = T2x - T2A;
Chris@82 1015 T3V = FMA(KP500000000, T2C, KP125000000 * T2B);
Chris@82 1016 T3W = KP279508497 * (T2x + T2A);
Chris@82 1017 }
Chris@82 1018 Im[WS(rs, 4)] = KP500000000 * (T2B - T2C);
Chris@82 1019 T45 = T3W - T3V;
Chris@82 1020 Im[0] = T45 - T46;
Chris@82 1021 Im[WS(rs, 8)] = T45 + T46;
Chris@82 1022 T3X = T3V + T3W;
Chris@82 1023 Ip[WS(rs, 3)] = T3X - T44;
Chris@82 1024 Ip[WS(rs, 7)] = T3X + T44;
Chris@82 1025 }
Chris@82 1026 {
Chris@82 1027 E T49, T4h, T4a, T4d, T4e, T4f, T4i, T4g;
Chris@82 1028 {
Chris@82 1029 E T47, T48, T4b, T4c;
Chris@82 1030 T47 = T2y - T2z;
Chris@82 1031 T48 = T2w + T2v;
Chris@82 1032 T49 = FNMS(KP293892626, T48, KP475528258 * T47);
Chris@82 1033 T4h = FMA(KP475528258, T48, KP293892626 * T47);
Chris@82 1034 T4a = T3N - T3O;
Chris@82 1035 T4b = T41 + T42;
Chris@82 1036 T4c = T3Y + T3Z;
Chris@82 1037 T4d = T4b + T4c;
Chris@82 1038 T4e = FNMS(KP125000000, T4d, KP500000000 * T4a);
Chris@82 1039 T4f = KP279508497 * (T4b - T4c);
Chris@82 1040 }
Chris@82 1041 Rm[WS(rs, 4)] = KP500000000 * (T4a + T4d);
Chris@82 1042 T4i = T4f + T4e;
Chris@82 1043 Rm[WS(rs, 8)] = T4h + T4i;
Chris@82 1044 Rm[0] = T4i - T4h;
Chris@82 1045 T4g = T4e - T4f;
Chris@82 1046 Rp[WS(rs, 3)] = T49 + T4g;
Chris@82 1047 Rp[WS(rs, 7)] = T4g - T49;
Chris@82 1048 }
Chris@82 1049 {
Chris@82 1050 E T50, T52, T34, T2Z, T4R, T4S, T51, T4T;
Chris@82 1051 {
Chris@82 1052 E T4W, T4Z, T2N, T2Y;
Chris@82 1053 T4W = T4U - T4V;
Chris@82 1054 T4Z = T4X - T4Y;
Chris@82 1055 T50 = FNMS(KP293892626, T4Z, KP475528258 * T4W);
Chris@82 1056 T52 = FMA(KP293892626, T4W, KP475528258 * T4Z);
Chris@82 1057 T34 = T32 + T33;
Chris@82 1058 T2N = T2H - T2M;
Chris@82 1059 T2Y = T2S + T2X;
Chris@82 1060 T2Z = T2N - T2Y;
Chris@82 1061 T4R = FMA(KP500000000, T34, KP125000000 * T2Z);
Chris@82 1062 T4S = KP279508497 * (T2Y + T2N);
Chris@82 1063 }
Chris@82 1064 Im[WS(rs, 9)] = KP500000000 * (T2Z - T34);
Chris@82 1065 T51 = T4R - T4S;
Chris@82 1066 Ip[WS(rs, 2)] = T51 + T52;
Chris@82 1067 Im[WS(rs, 1)] = T52 - T51;
Chris@82 1068 T4T = T4R + T4S;
Chris@82 1069 Ip[WS(rs, 6)] = T4T + T50;
Chris@82 1070 Im[WS(rs, 5)] = T50 - T4T;
Chris@82 1071 }
Chris@82 1072 {
Chris@82 1073 E T5c, T5d, T53, T56, T57, T58, T5e, T59;
Chris@82 1074 {
Chris@82 1075 E T5a, T5b, T54, T55;
Chris@82 1076 T5a = T2M + T2H;
Chris@82 1077 T5b = T2S - T2X;
Chris@82 1078 T5c = FNMS(KP293892626, T5b, KP475528258 * T5a);
Chris@82 1079 T5d = FMA(KP475528258, T5b, KP293892626 * T5a);
Chris@82 1080 T53 = T4G - T4H;
Chris@82 1081 T54 = T4V + T4U;
Chris@82 1082 T55 = T4X + T4Y;
Chris@82 1083 T56 = T54 + T55;
Chris@82 1084 T57 = FNMS(KP125000000, T56, KP500000000 * T53);
Chris@82 1085 T58 = KP279508497 * (T54 - T55);
Chris@82 1086 }
Chris@82 1087 Rm[WS(rs, 9)] = KP500000000 * (T53 + T56);
Chris@82 1088 T5e = T58 + T57;
Chris@82 1089 Rp[WS(rs, 6)] = T5d + T5e;
Chris@82 1090 Rm[WS(rs, 5)] = T5e - T5d;
Chris@82 1091 T59 = T57 - T58;
Chris@82 1092 Rp[WS(rs, 2)] = T59 - T5c;
Chris@82 1093 Rm[WS(rs, 1)] = T5c + T59;
Chris@82 1094 }
Chris@82 1095 {
Chris@82 1096 E T4A, T4C, T35, T3c, T4j, T4k, T4B, T4l;
Chris@82 1097 {
Chris@82 1098 E T4s, T4z, T38, T3b;
Chris@82 1099 T4s = T4o - T4r;
Chris@82 1100 T4z = T4v - T4y;
Chris@82 1101 T4A = FNMS(KP475528258, T4z, KP293892626 * T4s);
Chris@82 1102 T4C = FMA(KP475528258, T4s, KP293892626 * T4z);
Chris@82 1103 T35 = T33 - T32;
Chris@82 1104 T38 = T36 + T37;
Chris@82 1105 T3b = T39 + T3a;
Chris@82 1106 T3c = T38 + T3b;
Chris@82 1107 T4j = FNMS(KP125000000, T3c, KP500000000 * T35);
Chris@82 1108 T4k = KP279508497 * (T38 - T3b);
Chris@82 1109 }
Chris@82 1110 Ip[0] = KP500000000 * (T35 + T3c);
Chris@82 1111 T4B = T4k + T4j;
Chris@82 1112 Ip[WS(rs, 4)] = T4B + T4C;
Chris@82 1113 Im[WS(rs, 3)] = T4C - T4B;
Chris@82 1114 T4l = T4j - T4k;
Chris@82 1115 Ip[WS(rs, 8)] = T4l + T4A;
Chris@82 1116 Im[WS(rs, 7)] = T4A - T4l;
Chris@82 1117 }
Chris@82 1118 {
Chris@82 1119 E T4O, T4P, T4I, T4J, T4F, T4K, T4Q, T4L;
Chris@82 1120 {
Chris@82 1121 E T4M, T4N, T4D, T4E;
Chris@82 1122 T4M = T36 - T37;
Chris@82 1123 T4N = T39 - T3a;
Chris@82 1124 T4O = FMA(KP475528258, T4M, KP293892626 * T4N);
Chris@82 1125 T4P = FNMS(KP293892626, T4M, KP475528258 * T4N);
Chris@82 1126 T4I = T4G + T4H;
Chris@82 1127 T4D = T4o + T4r;
Chris@82 1128 T4E = T4v + T4y;
Chris@82 1129 T4J = T4D + T4E;
Chris@82 1130 T4F = KP279508497 * (T4D - T4E);
Chris@82 1131 T4K = FNMS(KP125000000, T4J, KP500000000 * T4I);
Chris@82 1132 }
Chris@82 1133 Rp[0] = KP500000000 * (T4I + T4J);
Chris@82 1134 T4Q = T4K - T4F;
Chris@82 1135 Rp[WS(rs, 8)] = T4P + T4Q;
Chris@82 1136 Rm[WS(rs, 7)] = T4Q - T4P;
Chris@82 1137 T4L = T4F + T4K;
Chris@82 1138 Rp[WS(rs, 4)] = T4L - T4O;
Chris@82 1139 Rm[WS(rs, 3)] = T4O + T4L;
Chris@82 1140 }
Chris@82 1141 }
Chris@82 1142 }
Chris@82 1143 }
Chris@82 1144
Chris@82 1145 static const tw_instr twinstr[] = { /* Twiddle-instruction table for this codelet; its address is stored in the descriptor `desc`. */
Chris@82 1146 {TW_FULL, 1, 20}, /* NOTE(review): presumably requests the full twiddle set for n = 20 -- TW_FULL is defined outside this chunk; confirm. */
Chris@82 1147 {TW_NEXT, 1, 0} /* NOTE(review): looks like the end-of-list entry -- confirm against the tw_instr definition. */
Chris@82 1148 };
Chris@82 1149
Chris@82 1150 static const hc2c_desc desc = { 20, "hc2cfdft_20", twinstr, &GENUS, {224, 78, 62, 0} }; /* Descriptor passed to X(khc2c_register): a leading 20, the codelet name string, the twiddle table, and &GENUS. NOTE(review): 20 presumably is the radix (generator was run with -n 20) and {224, 78, 62, 0} presumably are add/mul/fma/other operation counts -- confirm against the hc2c_desc definition. */
Chris@82 1151
Chris@82 1152 void X(codelet_hc2cfdft_20) (planner *p) { /* Registration entry point: hands the hc2cfdft_20 kernel to X(khc2c_register) for planner p; returns nothing. */
Chris@82 1153 X(khc2c_register) (p, hc2cfdft_20, &desc, HC2C_VIA_DFT); /* Passes the kernel function, its descriptor, and the HC2C_VIA_DFT constant (its meaning is defined outside this chunk). */
Chris@82 1154 }
Chris@82 1155 #endif