annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cbdft2_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:52:10 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cbdft2_20 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 286 FP additions, 148 FP multiplications,
Chris@42 32 * (or, 176 additions, 38 multiplications, 110 fused multiply/add),
Chris@42 33 * 122 stack variables, 4 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cbdft2_20 (HAVE_FMA build): machine-generated codelet; per the
 * "Generated by" line it was emitted by gen_hc2cdft with
 * "-fma ... -sign 1 -n 20 -dif".
 *
 * Rp, Ip, Rm, Im: data arrays that are read and then overwritten in place.
 *     Every iteration reads and writes indices WS(rs, k) for k = 0..9 of
 *     each of the four arrays (40 loads and 40 stores).
 * W:  twiddle table; each iteration consumes the 38 values W[0]..W[37].
 * rs: stride argument passed to WS() when forming element indices.
 * mb, me: the loop counter m runs over the half-open range [mb, me).
 * ms: per-iteration pointer step; Rp and Ip move by +ms, Rm and Im by -ms.
 *
 * NOTE(review): the roles of the four arrays (presumably real/imaginary
 * parts of the two halves of a halfcomplex layout) and the exact meaning of
 * WS(), DK(), E, R, FMA() and FNMS() come from hc2cb.h / codelet-rdft.h and
 * are not visible in this file -- confirm there.
 */
Chris@42 37 static void hc2cbdft2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/*
 * Named constants used by the arithmetic below. The literals are numerically
 * equal to sin(2*pi/5), sqrt(5)/4, 1/4 and (sqrt(5) - 1)/2 respectively.
 */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 {
Chris@42 44 INT m;
/*
 * W starts at block (mb - 1) of the twiddle table, 38 values per block, and
 * advances by one block per iteration. Rp/Ip walk forwards and Rm/Im walk
 * backwards by ms. MAKE_VOLATILE_STRIDE is a project macro evaluated in the
 * loop step.
 */
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
/* These temporaries carry the terms of output index 9, stored at the very end of the iteration. */
Chris@42 46 E T5s, T5v, T5t, T5z, T5q, T5y, T5u, T5A, T5w;
Chris@42 47 {
Chris@42 48 E T3T, T27, T2o, T41, T2p, T40, TU, T15, T2Q, T1N, T2L, T1w, T59, T4n, T5e;
Chris@42 49 E T4A, T2m, T24, T2Z, T2h, T4J, T3P, T3Y, T3W, T2d, TJ, T3H, T2c, TD, T52;
Chris@42 50 E T3G, T1E, T4f, T5I, T4e, T4w, T5L, T4v, T1J, T1H;
Chris@42 51 {
Chris@42 52 E T1A, T3, T25, TI, TF, T6, T26, T1D, TO, T47, T3z, Te, T1S, T3M, T1e;
Chris@42 53 E T4k, TZ, T4a, T3C, Tt, T1Z, T3J, T1p, T4h, T14, T4b, T3D, TA, T22, T3K;
Chris@42 54 E T1u, T4i, Ti, T1f, Th, T1T, TS, Tj, T1g, T1h;
/*
 * Load phase: all 40 inputs are read in this block, interleaved with the
 * first-level sums and differences of the loaded pairs.
 */
Chris@42 55 {
Chris@42 56 E T4, T5, T1B, T1C;
Chris@42 57 {
Chris@42 58 E T1, T2, TG, TH;
Chris@42 59 T1 = Rp[0];
Chris@42 60 T2 = Rm[WS(rs, 9)];
Chris@42 61 TG = Ip[0];
Chris@42 62 TH = Im[WS(rs, 9)];
Chris@42 63 T4 = Rp[WS(rs, 5)];
Chris@42 64 T1A = T1 - T2;
Chris@42 65 T3 = T1 + T2;
Chris@42 66 T25 = TG - TH;
Chris@42 67 TI = TG + TH;
Chris@42 68 T5 = Rm[WS(rs, 4)];
Chris@42 69 T1B = Ip[WS(rs, 5)];
Chris@42 70 T1C = Im[WS(rs, 4)];
Chris@42 71 }
Chris@42 72 {
Chris@42 73 E Tq, T1l, Tp, T1X, TY, Tr, T1m, T1n;
Chris@42 74 {
Chris@42 75 E Tb, T1a, Ta, T1Q, TN, Tc, T1b, T1c;
Chris@42 76 {
Chris@42 77 E T8, T9, TL, TM;
Chris@42 78 T8 = Rp[WS(rs, 4)];
Chris@42 79 TF = T4 - T5;
Chris@42 80 T6 = T4 + T5;
Chris@42 81 T26 = T1B - T1C;
Chris@42 82 T1D = T1B + T1C;
Chris@42 83 T9 = Rm[WS(rs, 5)];
Chris@42 84 TL = Ip[WS(rs, 4)];
Chris@42 85 TM = Im[WS(rs, 5)];
Chris@42 86 Tb = Rp[WS(rs, 9)];
Chris@42 87 T1a = T8 - T9;
Chris@42 88 Ta = T8 + T9;
Chris@42 89 T1Q = TL - TM;
Chris@42 90 TN = TL + TM;
Chris@42 91 Tc = Rm[0];
Chris@42 92 T1b = Ip[WS(rs, 9)];
Chris@42 93 T1c = Im[0];
Chris@42 94 }
Chris@42 95 {
Chris@42 96 E Tn, To, TW, TX;
Chris@42 97 Tn = Rp[WS(rs, 8)];
Chris@42 98 {
Chris@42 99 E TK, Td, T1R, T1d;
Chris@42 100 TK = Tb - Tc;
Chris@42 101 Td = Tb + Tc;
Chris@42 102 T1R = T1b - T1c;
Chris@42 103 T1d = T1b + T1c;
Chris@42 104 TO = TK + TN;
Chris@42 105 T47 = TN - TK;
Chris@42 106 T3z = Ta - Td;
Chris@42 107 Te = Ta + Td;
Chris@42 108 T1S = T1Q + T1R;
Chris@42 109 T3M = T1Q - T1R;
Chris@42 110 T1e = T1a - T1d;
Chris@42 111 T4k = T1a + T1d;
Chris@42 112 To = Rm[WS(rs, 1)];
Chris@42 113 }
Chris@42 114 TW = Ip[WS(rs, 8)];
Chris@42 115 TX = Im[WS(rs, 1)];
Chris@42 116 Tq = Rm[WS(rs, 6)];
Chris@42 117 T1l = Tn - To;
Chris@42 118 Tp = Tn + To;
Chris@42 119 T1X = TW - TX;
Chris@42 120 TY = TW + TX;
Chris@42 121 Tr = Rp[WS(rs, 3)];
Chris@42 122 T1m = Im[WS(rs, 6)];
Chris@42 123 T1n = Ip[WS(rs, 3)];
Chris@42 124 }
Chris@42 125 }
Chris@42 126 {
Chris@42 127 E Tx, T1q, Tw, T20, T13, Ty, T1r, T1s;
Chris@42 128 {
Chris@42 129 E Tu, Tv, T11, T12;
Chris@42 130 Tu = Rm[WS(rs, 7)];
Chris@42 131 {
Chris@42 132 E TV, Ts, T1Y, T1o;
Chris@42 133 TV = Tq - Tr;
Chris@42 134 Ts = Tq + Tr;
Chris@42 135 T1Y = T1n - T1m;
Chris@42 136 T1o = T1m + T1n;
Chris@42 137 TZ = TV + TY;
Chris@42 138 T4a = TY - TV;
Chris@42 139 T3C = Tp - Ts;
Chris@42 140 Tt = Tp + Ts;
Chris@42 141 T1Z = T1X + T1Y;
Chris@42 142 T3J = T1X - T1Y;
Chris@42 143 T1p = T1l + T1o;
Chris@42 144 T4h = T1l - T1o;
Chris@42 145 Tv = Rp[WS(rs, 2)];
Chris@42 146 }
Chris@42 147 T11 = Im[WS(rs, 7)];
Chris@42 148 T12 = Ip[WS(rs, 2)];
Chris@42 149 Tx = Rm[WS(rs, 2)];
Chris@42 150 T1q = Tu - Tv;
Chris@42 151 Tw = Tu + Tv;
Chris@42 152 T20 = T12 - T11;
Chris@42 153 T13 = T11 + T12;
Chris@42 154 Ty = Rp[WS(rs, 7)];
Chris@42 155 T1r = Im[WS(rs, 2)];
Chris@42 156 T1s = Ip[WS(rs, 7)];
Chris@42 157 }
Chris@42 158 {
Chris@42 159 E Tf, Tg, TQ, TR;
Chris@42 160 Tf = Rm[WS(rs, 3)];
Chris@42 161 {
Chris@42 162 E T10, Tz, T21, T1t;
Chris@42 163 T10 = Tx - Ty;
Chris@42 164 Tz = Tx + Ty;
Chris@42 165 T21 = T1s - T1r;
Chris@42 166 T1t = T1r + T1s;
Chris@42 167 T14 = T10 - T13;
Chris@42 168 T4b = T10 + T13;
Chris@42 169 T3D = Tw - Tz;
Chris@42 170 TA = Tw + Tz;
Chris@42 171 T22 = T20 + T21;
Chris@42 172 T3K = T20 - T21;
Chris@42 173 T1u = T1q + T1t;
Chris@42 174 T4i = T1q - T1t;
Chris@42 175 Tg = Rp[WS(rs, 6)];
Chris@42 176 }
Chris@42 177 TQ = Im[WS(rs, 3)];
Chris@42 178 TR = Ip[WS(rs, 6)];
Chris@42 179 Ti = Rp[WS(rs, 1)];
Chris@42 180 T1f = Tf - Tg;
Chris@42 181 Th = Tf + Tg;
Chris@42 182 T1T = TR - TQ;
Chris@42 183 TS = TQ + TR;
Chris@42 184 Tj = Rm[WS(rs, 8)];
Chris@42 185 T1g = Ip[WS(rs, 1)];
Chris@42 186 T1h = Im[WS(rs, 8)];
Chris@42 187 }
Chris@42 188 }
Chris@42 189 }
Chris@42 190 }
/*
 * Combination phase: no further loads. The first-level terms are merged, and
 * the constants KP618033988 and KP250000000 are first applied here.
 */
Chris@42 191 {
Chris@42 192 E T1V, T3N, TB, T3B, Tm, T3E, T1F, T1G, T4t, T4j, T4m, T4s, T4c, T4y, T4z;
Chris@42 193 E T49, T3y, T7;
Chris@42 194 {
Chris@42 195 E TT, T48, T1j, T4l, T3A, Tl;
Chris@42 196 T3T = T25 - T26;
Chris@42 197 T27 = T25 + T26;
Chris@42 198 {
Chris@42 199 E TP, Tk, T1U, T1i;
Chris@42 200 TP = Ti - Tj;
Chris@42 201 Tk = Ti + Tj;
Chris@42 202 T1U = T1g - T1h;
Chris@42 203 T1i = T1g + T1h;
Chris@42 204 TT = TP - TS;
Chris@42 205 T48 = TP + TS;
Chris@42 206 T3A = Th - Tk;
Chris@42 207 Tl = Th + Tk;
Chris@42 208 T1V = T1T + T1U;
Chris@42 209 T3N = T1T - T1U;
Chris@42 210 T1j = T1f - T1i;
Chris@42 211 T4l = T1f + T1i;
Chris@42 212 T2o = Tt - TA;
Chris@42 213 TB = Tt + TA;
Chris@42 214 }
Chris@42 215 T41 = T3z - T3A;
Chris@42 216 T3B = T3z + T3A;
Chris@42 217 Tm = Te + Tl;
Chris@42 218 T2p = Te - Tl;
Chris@42 219 {
Chris@42 220 E T1L, T1M, T1k, T1v;
Chris@42 221 T40 = T3C - T3D;
Chris@42 222 T3E = T3C + T3D;
Chris@42 223 TU = TO + TT;
Chris@42 224 T1L = TO - TT;
Chris@42 225 T1M = TZ - T14;
Chris@42 226 T15 = TZ + T14;
Chris@42 227 T1F = T1e + T1j;
Chris@42 228 T1k = T1e - T1j;
Chris@42 229 T1v = T1p - T1u;
Chris@42 230 T1G = T1p + T1u;
Chris@42 231 T4t = T4h + T4i;
Chris@42 232 T4j = T4h - T4i;
Chris@42 233 T2Q = FNMS(KP618033988, T1L, T1M);
Chris@42 234 T1N = FMA(KP618033988, T1M, T1L);
Chris@42 235 T2L = FNMS(KP618033988, T1k, T1v);
Chris@42 236 T1w = FMA(KP618033988, T1v, T1k);
Chris@42 237 T4m = T4k - T4l;
Chris@42 238 T4s = T4k + T4l;
Chris@42 239 T4c = T4a - T4b;
Chris@42 240 T4y = T4a + T4b;
Chris@42 241 T4z = T47 + T48;
Chris@42 242 T49 = T47 - T48;
Chris@42 243 }
Chris@42 244 }
Chris@42 245 {
Chris@42 246 E T2g, T1W, T23, T2f;
Chris@42 247 T2g = T1S - T1V;
Chris@42 248 T1W = T1S + T1V;
Chris@42 249 T59 = FMA(KP618033988, T4j, T4m);
Chris@42 250 T4n = FNMS(KP618033988, T4m, T4j);
Chris@42 251 T5e = FMA(KP618033988, T4y, T4z);
Chris@42 252 T4A = FNMS(KP618033988, T4z, T4y);
Chris@42 253 T23 = T1Z + T22;
Chris@42 254 T2f = T1Z - T22;
Chris@42 255 {
Chris@42 256 E T3V, T3L, T3O, T3U;
Chris@42 257 T3V = T3J + T3K;
Chris@42 258 T3L = T3J - T3K;
Chris@42 259 T2m = T1W - T23;
Chris@42 260 T24 = T1W + T23;
Chris@42 261 T2Z = FMA(KP618033988, T2f, T2g);
Chris@42 262 T2h = FNMS(KP618033988, T2g, T2f);
Chris@42 263 T3O = T3M - T3N;
Chris@42 264 T3U = T3M + T3N;
Chris@42 265 T3y = T3 - T6;
Chris@42 266 T7 = T3 + T6;
Chris@42 267 T4J = FMA(KP618033988, T3L, T3O);
Chris@42 268 T3P = FNMS(KP618033988, T3O, T3L);
Chris@42 269 T3Y = T3U - T3V;
Chris@42 270 T3W = T3U + T3V;
Chris@42 271 }
Chris@42 272 }
Chris@42 273 {
Chris@42 274 E T46, TC, T3F, T4r, T4d, T4u;
Chris@42 275 TC = Tm + TB;
Chris@42 276 T2d = Tm - TB;
Chris@42 277 TJ = TF + TI;
Chris@42 278 T46 = TI - TF;
Chris@42 279 T3H = T3B - T3E;
Chris@42 280 T3F = T3B + T3E;
Chris@42 281 T2c = FNMS(KP250000000, TC, T7);
Chris@42 282 TD = T7 + TC;
Chris@42 283 T52 = T3y + T3F;
Chris@42 284 T3G = FNMS(KP250000000, T3F, T3y);
Chris@42 285 T4r = T1A + T1D;
Chris@42 286 T1E = T1A - T1D;
Chris@42 287 T4f = T49 - T4c;
Chris@42 288 T4d = T49 + T4c;
Chris@42 289 T5I = T46 + T4d;
Chris@42 290 T4e = FNMS(KP250000000, T4d, T46);
Chris@42 291 T4w = T4s - T4t;
Chris@42 292 T4u = T4s + T4t;
Chris@42 293 T5L = T4u + T4r;
Chris@42 294 T4v = FNMS(KP250000000, T4u, T4r);
Chris@42 295 T1J = T1F - T1G;
Chris@42 296 T1H = T1F + T1G;
Chris@42 297 }
Chris@42 298 }
Chris@42 299 }
Chris@42 300 {
/* Terms of output index 8; they are computed in the next block and stored at the top of the one after it. */
Chris@42 301 E T38, T3b, T39, T3f, T36, T3e, T3a;
/*
 * Even-indexed outputs: multiplies intermediates by values from W and stores
 * indices 0, 2, 4 and 6 of all four arrays; also prepares the index-8 terms.
 */
Chris@42 302 {
Chris@42 303 E T28, T3r, T3o, T3v, T3p, T2b, T2k, T35, T3l, T2H, T2r, T2j, T2z, T2D, T2G;
Chris@42 304 E T2X, T2F, T2T, T32, T3h, T3k, T31, T3d, T3j, T3t, T1x, T2u, T1O, T2x, T2v;
Chris@42 305 E T1y, T2B, T29, T2J, T2M, T2R, T2N, T2V;
Chris@42 306 {
Chris@42 307 E T2l, T1I, T18, T2q, T34, T17, T16, T3n;
Chris@42 308 T28 = T24 + T27;
Chris@42 309 T2l = FNMS(KP250000000, T24, T27);
Chris@42 310 T3r = T1H + T1E;
Chris@42 311 T1I = FNMS(KP250000000, T1H, T1E);
Chris@42 312 T18 = TU - T15;
Chris@42 313 T16 = TU + T15;
Chris@42 314 T3n = W[8];
Chris@42 315 T2q = FNMS(KP618033988, T2p, T2o);
Chris@42 316 T34 = FMA(KP618033988, T2o, T2p);
Chris@42 317 T17 = FNMS(KP250000000, T16, TJ);
Chris@42 318 T3o = TJ + T16;
Chris@42 319 T3v = T3n * T3r;
Chris@42 320 T3p = T3n * T3o;
Chris@42 321 {
Chris@42 322 E T2Y, T2E, T3i, T30;
Chris@42 323 {
Chris@42 324 E T2e, T33, T2n, T2i;
Chris@42 325 T2Y = FMA(KP559016994, T2d, T2c);
Chris@42 326 T2e = FNMS(KP559016994, T2d, T2c);
Chris@42 327 T2b = W[14];
Chris@42 328 T2k = W[15];
Chris@42 329 T33 = FMA(KP559016994, T2m, T2l);
Chris@42 330 T2n = FNMS(KP559016994, T2m, T2l);
Chris@42 331 T2E = FMA(KP951056516, T2h, T2e);
Chris@42 332 T2i = FNMS(KP951056516, T2h, T2e);
Chris@42 333 T35 = FMA(KP951056516, T34, T33);
Chris@42 334 T3l = FNMS(KP951056516, T34, T33);
Chris@42 335 T2H = FNMS(KP951056516, T2q, T2n);
Chris@42 336 T2r = FMA(KP951056516, T2q, T2n);
Chris@42 337 T2j = T2b * T2i;
Chris@42 338 T2z = T2k * T2i;
Chris@42 339 T2D = W[22];
Chris@42 340 T2G = W[23];
Chris@42 341 }
Chris@42 342 T2X = W[30];
Chris@42 343 T2F = T2D * T2E;
Chris@42 344 T2T = T2G * T2E;
Chris@42 345 T3i = FMA(KP951056516, T2Z, T2Y);
Chris@42 346 T30 = FNMS(KP951056516, T2Z, T2Y);
Chris@42 347 T32 = W[31];
Chris@42 348 T3h = W[6];
Chris@42 349 T3k = W[7];
Chris@42 350 T31 = T2X * T30;
Chris@42 351 T3d = T32 * T30;
Chris@42 352 T3j = T3h * T3i;
Chris@42 353 T3t = T3k * T3i;
Chris@42 354 }
Chris@42 355 {
Chris@42 356 E T2K, T2P, TE, T19, T1K, T2t, T37;
Chris@42 357 T2K = FNMS(KP559016994, T18, T17);
Chris@42 358 T19 = FMA(KP559016994, T18, T17);
Chris@42 359 T1K = FMA(KP559016994, T1J, T1I);
Chris@42 360 T2P = FNMS(KP559016994, T1J, T1I);
Chris@42 361 TE = W[0];
Chris@42 362 T2t = W[16];
Chris@42 363 T1x = FMA(KP951056516, T1w, T19);
Chris@42 364 T2u = FNMS(KP951056516, T1w, T19);
Chris@42 365 T1O = FNMS(KP951056516, T1N, T1K);
Chris@42 366 T2x = FMA(KP951056516, T1N, T1K);
Chris@42 367 T2v = T2t * T2u;
Chris@42 368 T1y = TE * T1x;
Chris@42 369 T2B = T2t * T2x;
Chris@42 370 T29 = TE * T1O;
Chris@42 371 T2J = W[24];
Chris@42 372 T37 = W[32];
Chris@42 373 T2M = FMA(KP951056516, T2L, T2K);
Chris@42 374 T38 = FNMS(KP951056516, T2L, T2K);
Chris@42 375 T2R = FNMS(KP951056516, T2Q, T2P);
Chris@42 376 T3b = FMA(KP951056516, T2Q, T2P);
Chris@42 377 T39 = T37 * T38;
Chris@42 378 T2N = T2J * T2M;
Chris@42 379 T3f = T37 * T3b;
Chris@42 380 }
Chris@42 381 }
Chris@42 382 T2V = T2J * T2R;
Chris@42 383 {
Chris@42 384 E T3m, T3u, T3q, T2a, T1P, T1z;
Chris@42 385 T1z = W[1];
Chris@42 386 T3m = FNMS(T3k, T3l, T3j);
Chris@42 387 T3u = FMA(T3h, T3l, T3t);
Chris@42 388 T3q = W[9];
Chris@42 389 T2a = FNMS(T1z, T1x, T29);
Chris@42 390 T1P = FMA(T1z, T1O, T1y);
Chris@42 391 {
Chris@42 392 E T2s, T2A, T2w, T3w, T3s;
Chris@42 393 T2s = FNMS(T2k, T2r, T2j);
Chris@42 394 T3w = FNMS(T3q, T3o, T3v);
Chris@42 395 T3s = FMA(T3q, T3r, T3p);
Chris@42 396 Im[0] = T2a - T28;
Chris@42 397 Ip[0] = T28 + T2a;
Chris@42 398 Rm[0] = TD + T1P;
Chris@42 399 Rp[0] = TD - T1P;
Chris@42 400 Im[WS(rs, 2)] = T3w - T3u;
Chris@42 401 Ip[WS(rs, 2)] = T3u + T3w;
Chris@42 402 Rm[WS(rs, 2)] = T3m + T3s;
Chris@42 403 Rp[WS(rs, 2)] = T3m - T3s;
Chris@42 404 T2A = FMA(T2b, T2r, T2z);
Chris@42 405 T2w = W[17];
Chris@42 406 {
Chris@42 407 E T2I, T2U, T2O, T2C, T2y, T2W, T2S;
Chris@42 408 T2I = FNMS(T2G, T2H, T2F);
Chris@42 409 T2U = FMA(T2D, T2H, T2T);
Chris@42 410 T2O = W[25];
Chris@42 411 T2C = FNMS(T2w, T2u, T2B);
Chris@42 412 T2y = FMA(T2w, T2x, T2v);
Chris@42 413 T36 = FNMS(T32, T35, T31);
Chris@42 414 T2W = FNMS(T2O, T2M, T2V);
Chris@42 415 T2S = FMA(T2O, T2R, T2N);
Chris@42 416 Im[WS(rs, 4)] = T2C - T2A;
Chris@42 417 Ip[WS(rs, 4)] = T2A + T2C;
Chris@42 418 Rm[WS(rs, 4)] = T2s + T2y;
Chris@42 419 Rp[WS(rs, 4)] = T2s - T2y;
Chris@42 420 Im[WS(rs, 6)] = T2W - T2U;
Chris@42 421 Ip[WS(rs, 6)] = T2U + T2W;
Chris@42 422 Rm[WS(rs, 6)] = T2I + T2S;
Chris@42 423 Rp[WS(rs, 6)] = T2I - T2S;
Chris@42 424 T3e = FMA(T2X, T35, T3d);
Chris@42 425 T3a = W[33];
Chris@42 426 }
Chris@42 427 }
Chris@42 428 }
Chris@42 429 }
/*
 * Stores index 8 first, then the odd-indexed outputs 1, 7, 3 and 5, and
 * prepares the index-9 terms that are stored after this block closes.
 */
Chris@42 430 {
Chris@42 431 E T55, T51, T54, T53, T5h, T5P, T5J, T3x, T4P, T5F, T5p, T43, T3R, T3S, T5l;
Chris@42 432 E T5o, T4D, T5n, T5x, T4H, T4M, T5B, T5E, T4L, T4X, T5D, T5N, T4S, T4o, T4V;
Chris@42 433 E T4B, T4T, T4p, T4Z, T4F, T57, T5a, T5f, T5b, T5j;
Chris@42 434 {
Chris@42 435 E T3X, T4O, T42, T3g, T3c, T5H;
Chris@42 436 T55 = T3W + T3T;
Chris@42 437 T3X = FNMS(KP250000000, T3W, T3T);
Chris@42 438 T51 = W[18];
Chris@42 439 T3g = FNMS(T3a, T38, T3f);
Chris@42 440 T3c = FMA(T3a, T3b, T39);
Chris@42 441 T54 = W[19];
Chris@42 442 T53 = T51 * T52;
Chris@42 443 Im[WS(rs, 8)] = T3g - T3e;
Chris@42 444 Ip[WS(rs, 8)] = T3e + T3g;
Chris@42 445 Rm[WS(rs, 8)] = T36 + T3c;
Chris@42 446 Rp[WS(rs, 8)] = T36 - T3c;
Chris@42 447 T5h = T54 * T52;
Chris@42 448 T5H = W[28];
Chris@42 449 T4O = FMA(KP618033988, T40, T41);
Chris@42 450 T42 = FNMS(KP618033988, T41, T40);
Chris@42 451 T5P = T5H * T5L;
Chris@42 452 T5J = T5H * T5I;
Chris@42 453 {
Chris@42 454 E T4I, T5m, T3Q, T3I, T3Z, T4N, T4K, T5C;
Chris@42 455 T3I = FNMS(KP559016994, T3H, T3G);
Chris@42 456 T4I = FMA(KP559016994, T3H, T3G);
Chris@42 457 T3Z = FNMS(KP559016994, T3Y, T3X);
Chris@42 458 T4N = FMA(KP559016994, T3Y, T3X);
Chris@42 459 T3x = W[2];
Chris@42 460 T5m = FNMS(KP951056516, T3P, T3I);
Chris@42 461 T3Q = FMA(KP951056516, T3P, T3I);
Chris@42 462 T4P = FMA(KP951056516, T4O, T4N);
Chris@42 463 T5F = FNMS(KP951056516, T4O, T4N);
Chris@42 464 T5p = FMA(KP951056516, T42, T3Z);
Chris@42 465 T43 = FNMS(KP951056516, T42, T3Z);
Chris@42 466 T3R = T3x * T3Q;
Chris@42 467 T3S = W[3];
Chris@42 468 T5l = W[34];
Chris@42 469 T5o = W[35];
Chris@42 470 T4D = T3S * T3Q;
Chris@42 471 T5n = T5l * T5m;
Chris@42 472 T5x = T5o * T5m;
Chris@42 473 T4K = FNMS(KP951056516, T4J, T4I);
Chris@42 474 T5C = FMA(KP951056516, T4J, T4I);
Chris@42 475 T4H = W[10];
Chris@42 476 T4M = W[11];
Chris@42 477 T5B = W[26];
Chris@42 478 T5E = W[27];
Chris@42 479 T4L = T4H * T4K;
Chris@42 480 T4X = T4M * T4K;
Chris@42 481 T5D = T5B * T5C;
Chris@42 482 T5N = T5E * T5C;
Chris@42 483 }
Chris@42 484 {
Chris@42 485 E T58, T5d, T45, T4g, T4x, T4R, T5r;
Chris@42 486 T4g = FNMS(KP559016994, T4f, T4e);
Chris@42 487 T58 = FMA(KP559016994, T4f, T4e);
Chris@42 488 T5d = FMA(KP559016994, T4w, T4v);
Chris@42 489 T4x = FNMS(KP559016994, T4w, T4v);
Chris@42 490 T45 = W[4];
Chris@42 491 T4R = W[12];
Chris@42 492 T4S = FNMS(KP951056516, T4n, T4g);
Chris@42 493 T4o = FMA(KP951056516, T4n, T4g);
Chris@42 494 T4V = FMA(KP951056516, T4A, T4x);
Chris@42 495 T4B = FNMS(KP951056516, T4A, T4x);
Chris@42 496 T4T = T4R * T4S;
Chris@42 497 T4p = T45 * T4o;
Chris@42 498 T4Z = T4R * T4V;
Chris@42 499 T4F = T45 * T4B;
Chris@42 500 T57 = W[20];
Chris@42 501 T5r = W[36];
Chris@42 502 T5s = FNMS(KP951056516, T59, T58);
Chris@42 503 T5a = FMA(KP951056516, T59, T58);
Chris@42 504 T5v = FMA(KP951056516, T5e, T5d);
Chris@42 505 T5f = FNMS(KP951056516, T5e, T5d);
Chris@42 506 T5t = T5r * T5s;
Chris@42 507 T5b = T57 * T5a;
Chris@42 508 T5z = T5r * T5v;
Chris@42 509 }
Chris@42 510 }
Chris@42 511 T5j = T57 * T5f;
Chris@42 512 {
Chris@42 513 E T44, T4E, T5G, T5O, T5K, T4G, T4C, T4q;
Chris@42 514 T44 = FNMS(T3S, T43, T3R);
Chris@42 515 T4E = FMA(T3x, T43, T4D);
Chris@42 516 T4q = W[5];
Chris@42 517 T5G = FNMS(T5E, T5F, T5D);
Chris@42 518 T5O = FMA(T5B, T5F, T5N);
Chris@42 519 T5K = W[29];
Chris@42 520 T4G = FNMS(T4q, T4o, T4F);
Chris@42 521 T4C = FMA(T4q, T4B, T4p);
Chris@42 522 {
Chris@42 523 E T4Q, T4Y, T4U, T5Q, T5M;
Chris@42 524 T4Q = FNMS(T4M, T4P, T4L);
Chris@42 525 T5Q = FNMS(T5K, T5I, T5P);
Chris@42 526 T5M = FMA(T5K, T5L, T5J);
Chris@42 527 Im[WS(rs, 1)] = T4G - T4E;
Chris@42 528 Ip[WS(rs, 1)] = T4E + T4G;
Chris@42 529 Rm[WS(rs, 1)] = T44 + T4C;
Chris@42 530 Rp[WS(rs, 1)] = T44 - T4C;
Chris@42 531 Im[WS(rs, 7)] = T5Q - T5O;
Chris@42 532 Ip[WS(rs, 7)] = T5O + T5Q;
Chris@42 533 Rm[WS(rs, 7)] = T5G + T5M;
Chris@42 534 Rp[WS(rs, 7)] = T5G - T5M;
Chris@42 535 T4Y = FMA(T4H, T4P, T4X);
Chris@42 536 T4U = W[13];
Chris@42 537 {
Chris@42 538 E T56, T5i, T5c, T50, T4W, T5k, T5g;
Chris@42 539 T56 = FNMS(T54, T55, T53);
Chris@42 540 T5i = FMA(T51, T55, T5h);
Chris@42 541 T5c = W[21];
Chris@42 542 T50 = FNMS(T4U, T4S, T4Z);
Chris@42 543 T4W = FMA(T4U, T4V, T4T);
Chris@42 544 T5q = FNMS(T5o, T5p, T5n);
Chris@42 545 T5k = FNMS(T5c, T5a, T5j);
Chris@42 546 T5g = FMA(T5c, T5f, T5b);
Chris@42 547 Im[WS(rs, 3)] = T50 - T4Y;
Chris@42 548 Ip[WS(rs, 3)] = T4Y + T50;
Chris@42 549 Rm[WS(rs, 3)] = T4Q + T4W;
Chris@42 550 Rp[WS(rs, 3)] = T4Q - T4W;
Chris@42 551 Im[WS(rs, 5)] = T5k - T5i;
Chris@42 552 Ip[WS(rs, 5)] = T5i + T5k;
Chris@42 553 Rm[WS(rs, 5)] = T56 + T5g;
Chris@42 554 Rp[WS(rs, 5)] = T56 - T5g;
Chris@42 555 T5y = FMA(T5l, T5p, T5x);
Chris@42 556 T5u = W[37];
Chris@42 557 }
Chris@42 558 }
Chris@42 559 }
Chris@42 560 }
Chris@42 561 }
Chris@42 562 }
/* Last output: finish the index-9 terms with W[37] and store them. */
Chris@42 563 T5A = FNMS(T5u, T5s, T5z);
Chris@42 564 T5w = FMA(T5u, T5v, T5t);
Chris@42 565 Im[WS(rs, 9)] = T5A - T5y;
Chris@42 566 Ip[WS(rs, 9)] = T5y + T5A;
Chris@42 567 Rm[WS(rs, 9)] = T5q + T5w;
Chris@42 568 Rp[WS(rs, 9)] = T5q - T5w;
Chris@42 569 }
Chris@42 570 }
Chris@42 571 }
Chris@42 572
/*
 * Twiddle instruction table referenced by the codelet descriptor: one
 * TW_FULL entry with arguments (1, 20) followed by a TW_NEXT entry with
 * arguments (1, 0).
 * NOTE(review): the tw_instr field meanings are defined in the project
 * headers, not here -- presumably 20 is the transform size; confirm.
 */
Chris@42 573 static const tw_instr twinstr[] = {
Chris@42 574 {TW_FULL, 1, 20},
Chris@42 575 {TW_NEXT, 1, 0}
Chris@42 576 };
Chris@42 577
/*
 * Descriptor passed to the planner at registration time: the value 20
 * (presumably the transform size -- confirm against hc2c_desc), the codelet
 * name, its twiddle instruction table, &GENUS, and the counts
 * {176, 38, 110, 0}. The first three counts match the "176 additions,
 * 38 multiplications, 110 fused multiply/add" figures quoted in the
 * HAVE_FMA header comment of this file.
 */
Chris@42 578 static const hc2c_desc desc = { 20, "hc2cbdft2_20", twinstr, &GENUS, {176, 38, 110, 0} };
Chris@42 579
/*
 * Registration entry point for this codelet: hands hc2cbdft2_20 and its
 * descriptor to planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
Chris@42 580 void X(codelet_hc2cbdft2_20) (planner *p) {
Chris@42 581 X(khc2c_register) (p, hc2cbdft2_20, &desc, HC2C_VIA_DFT);
Chris@42 582 }
Chris@42 583 #else /* HAVE_FMA */
Chris@42 584
Chris@42 585 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cbdft2_20 -include hc2cb.h */
Chris@42 586
Chris@42 587 /*
Chris@42 588 * This function contains 286 FP additions, 124 FP multiplications,
Chris@42 589 * (or, 224 additions, 62 multiplications, 62 fused multiply/add),
Chris@42 590 * 89 stack variables, 4 constants, and 80 memory accesses
Chris@42 591 */
Chris@42 592 #include "hc2cb.h"
Chris@42 593
Chris@42 594 static void hc2cbdft2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 595 {
Chris@42 596 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 597 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 598 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 599 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 600 {
Chris@42 601 INT m;
Chris@42 602 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@42 603 E T7, T3N, T4a, T16, T1G, T3g, T3D, T26, T1k, T3A, T3B, T1v, T2e, T48, T47;
Chris@42 604 E T2d, T1L, T43, T40, T1K, T2l, T3t, T2m, T3w, T3n, T3p, TC, T2b, T4d, T4f;
Chris@42 605 E T23, T2j, T1B, T1H, T3U, T3W, T3G, T3I, T11, T17;
Chris@42 606 {
Chris@42 607 E T3, T1C, T15, T24, T6, T12, T1F, T25;
Chris@42 608 {
Chris@42 609 E T1, T2, T13, T14;
Chris@42 610 T1 = Rp[0];
Chris@42 611 T2 = Rm[WS(rs, 9)];
Chris@42 612 T3 = T1 + T2;
Chris@42 613 T1C = T1 - T2;
Chris@42 614 T13 = Ip[0];
Chris@42 615 T14 = Im[WS(rs, 9)];
Chris@42 616 T15 = T13 + T14;
Chris@42 617 T24 = T13 - T14;
Chris@42 618 }
Chris@42 619 {
Chris@42 620 E T4, T5, T1D, T1E;
Chris@42 621 T4 = Rp[WS(rs, 5)];
Chris@42 622 T5 = Rm[WS(rs, 4)];
Chris@42 623 T6 = T4 + T5;
Chris@42 624 T12 = T4 - T5;
Chris@42 625 T1D = Ip[WS(rs, 5)];
Chris@42 626 T1E = Im[WS(rs, 4)];
Chris@42 627 T1F = T1D + T1E;
Chris@42 628 T25 = T1D - T1E;
Chris@42 629 }
Chris@42 630 T7 = T3 + T6;
Chris@42 631 T3N = T15 - T12;
Chris@42 632 T4a = T1C + T1F;
Chris@42 633 T16 = T12 + T15;
Chris@42 634 T1G = T1C - T1F;
Chris@42 635 T3g = T3 - T6;
Chris@42 636 T3D = T24 - T25;
Chris@42 637 T26 = T24 + T25;
Chris@42 638 }
Chris@42 639 {
Chris@42 640 E Te, T3O, T3Y, TJ, T1e, T3h, T3r, T1R, TA, T3S, T42, TZ, T1u, T3l, T3v;
Chris@42 641 E T21, Tl, T3P, T3Z, TO, T1j, T3i, T3s, T1U, Tt, T3R, T41, TU, T1p, T3k;
Chris@42 642 E T3u, T1Y;
Chris@42 643 {
Chris@42 644 E Ta, T1a, TI, T1P, Td, TF, T1d, T1Q;
Chris@42 645 {
Chris@42 646 E T8, T9, TG, TH;
Chris@42 647 T8 = Rp[WS(rs, 4)];
Chris@42 648 T9 = Rm[WS(rs, 5)];
Chris@42 649 Ta = T8 + T9;
Chris@42 650 T1a = T8 - T9;
Chris@42 651 TG = Ip[WS(rs, 4)];
Chris@42 652 TH = Im[WS(rs, 5)];
Chris@42 653 TI = TG + TH;
Chris@42 654 T1P = TG - TH;
Chris@42 655 }
Chris@42 656 {
Chris@42 657 E Tb, Tc, T1b, T1c;
Chris@42 658 Tb = Rp[WS(rs, 9)];
Chris@42 659 Tc = Rm[0];
Chris@42 660 Td = Tb + Tc;
Chris@42 661 TF = Tb - Tc;
Chris@42 662 T1b = Ip[WS(rs, 9)];
Chris@42 663 T1c = Im[0];
Chris@42 664 T1d = T1b + T1c;
Chris@42 665 T1Q = T1b - T1c;
Chris@42 666 }
Chris@42 667 Te = Ta + Td;
Chris@42 668 T3O = TI - TF;
Chris@42 669 T3Y = T1a + T1d;
Chris@42 670 TJ = TF + TI;
Chris@42 671 T1e = T1a - T1d;
Chris@42 672 T3h = Ta - Td;
Chris@42 673 T3r = T1P - T1Q;
Chris@42 674 T1R = T1P + T1Q;
Chris@42 675 }
Chris@42 676 {
Chris@42 677 E Tw, T1q, TY, T1Z, Tz, TV, T1t, T20;
Chris@42 678 {
Chris@42 679 E Tu, Tv, TW, TX;
Chris@42 680 Tu = Rm[WS(rs, 7)];
Chris@42 681 Tv = Rp[WS(rs, 2)];
Chris@42 682 Tw = Tu + Tv;
Chris@42 683 T1q = Tu - Tv;
Chris@42 684 TW = Im[WS(rs, 7)];
Chris@42 685 TX = Ip[WS(rs, 2)];
Chris@42 686 TY = TW + TX;
Chris@42 687 T1Z = TX - TW;
Chris@42 688 }
Chris@42 689 {
Chris@42 690 E Tx, Ty, T1r, T1s;
Chris@42 691 Tx = Rm[WS(rs, 2)];
Chris@42 692 Ty = Rp[WS(rs, 7)];
Chris@42 693 Tz = Tx + Ty;
Chris@42 694 TV = Tx - Ty;
Chris@42 695 T1r = Im[WS(rs, 2)];
Chris@42 696 T1s = Ip[WS(rs, 7)];
Chris@42 697 T1t = T1r + T1s;
Chris@42 698 T20 = T1s - T1r;
Chris@42 699 }
Chris@42 700 TA = Tw + Tz;
Chris@42 701 T3S = TV + TY;
Chris@42 702 T42 = T1q - T1t;
Chris@42 703 TZ = TV - TY;
Chris@42 704 T1u = T1q + T1t;
Chris@42 705 T3l = Tw - Tz;
Chris@42 706 T3v = T1Z - T20;
Chris@42 707 T21 = T1Z + T20;
Chris@42 708 }
Chris@42 709 {
Chris@42 710 E Th, T1f, TN, T1S, Tk, TK, T1i, T1T;
Chris@42 711 {
Chris@42 712 E Tf, Tg, TL, TM;
Chris@42 713 Tf = Rm[WS(rs, 3)];
Chris@42 714 Tg = Rp[WS(rs, 6)];
Chris@42 715 Th = Tf + Tg;
Chris@42 716 T1f = Tf - Tg;
Chris@42 717 TL = Im[WS(rs, 3)];
Chris@42 718 TM = Ip[WS(rs, 6)];
Chris@42 719 TN = TL + TM;
Chris@42 720 T1S = TM - TL;
Chris@42 721 }
Chris@42 722 {
Chris@42 723 E Ti, Tj, T1g, T1h;
Chris@42 724 Ti = Rp[WS(rs, 1)];
Chris@42 725 Tj = Rm[WS(rs, 8)];
Chris@42 726 Tk = Ti + Tj;
Chris@42 727 TK = Ti - Tj;
Chris@42 728 T1g = Ip[WS(rs, 1)];
Chris@42 729 T1h = Im[WS(rs, 8)];
Chris@42 730 T1i = T1g + T1h;
Chris@42 731 T1T = T1g - T1h;
Chris@42 732 }
Chris@42 733 Tl = Th + Tk;
Chris@42 734 T3P = TK + TN;
Chris@42 735 T3Z = T1f + T1i;
Chris@42 736 TO = TK - TN;
Chris@42 737 T1j = T1f - T1i;
Chris@42 738 T3i = Th - Tk;
Chris@42 739 T3s = T1S - T1T;
Chris@42 740 T1U = T1S + T1T;
Chris@42 741 }
Chris@42 742 {
Chris@42 743 E Tp, T1l, TT, T1W, Ts, TQ, T1o, T1X;
Chris@42 744 {
Chris@42 745 E Tn, To, TR, TS;
Chris@42 746 Tn = Rp[WS(rs, 8)];
Chris@42 747 To = Rm[WS(rs, 1)];
Chris@42 748 Tp = Tn + To;
Chris@42 749 T1l = Tn - To;
Chris@42 750 TR = Ip[WS(rs, 8)];
Chris@42 751 TS = Im[WS(rs, 1)];
Chris@42 752 TT = TR + TS;
Chris@42 753 T1W = TR - TS;
Chris@42 754 }
Chris@42 755 {
Chris@42 756 E Tq, Tr, T1m, T1n;
Chris@42 757 Tq = Rm[WS(rs, 6)];
Chris@42 758 Tr = Rp[WS(rs, 3)];
Chris@42 759 Ts = Tq + Tr;
Chris@42 760 TQ = Tq - Tr;
Chris@42 761 T1m = Im[WS(rs, 6)];
Chris@42 762 T1n = Ip[WS(rs, 3)];
Chris@42 763 T1o = T1m + T1n;
Chris@42 764 T1X = T1n - T1m;
Chris@42 765 }
Chris@42 766 Tt = Tp + Ts;
Chris@42 767 T3R = TT - TQ;
Chris@42 768 T41 = T1l - T1o;
Chris@42 769 TU = TQ + TT;
Chris@42 770 T1p = T1l + T1o;
Chris@42 771 T3k = Tp - Ts;
Chris@42 772 T3u = T1W - T1X;
Chris@42 773 T1Y = T1W + T1X;
Chris@42 774 }
Chris@42 775 T1k = T1e - T1j;
Chris@42 776 T3A = T3h - T3i;
Chris@42 777 T3B = T3k - T3l;
Chris@42 778 T1v = T1p - T1u;
Chris@42 779 T2e = T1Y - T21;
Chris@42 780 T48 = T3R + T3S;
Chris@42 781 T47 = T3O + T3P;
Chris@42 782 T2d = T1R - T1U;
Chris@42 783 T1L = TU - TZ;
Chris@42 784 T43 = T41 - T42;
Chris@42 785 T40 = T3Y - T3Z;
Chris@42 786 T1K = TJ - TO;
Chris@42 787 T2l = Te - Tl;
Chris@42 788 T3t = T3r - T3s;
Chris@42 789 T2m = Tt - TA;
Chris@42 790 T3w = T3u - T3v;
Chris@42 791 {
Chris@42 792 E T3j, T3m, Tm, TB;
Chris@42 793 T3j = T3h + T3i;
Chris@42 794 T3m = T3k + T3l;
Chris@42 795 T3n = T3j + T3m;
Chris@42 796 T3p = KP559016994 * (T3j - T3m);
Chris@42 797 Tm = Te + Tl;
Chris@42 798 TB = Tt + TA;
Chris@42 799 TC = Tm + TB;
Chris@42 800 T2b = KP559016994 * (Tm - TB);
Chris@42 801 }
Chris@42 802 {
Chris@42 803 E T4b, T4c, T3Q, T3T;
Chris@42 804 T4b = T3Y + T3Z;
Chris@42 805 T4c = T41 + T42;
Chris@42 806 T4d = T4b + T4c;
Chris@42 807 T4f = KP559016994 * (T4b - T4c);
Chris@42 808 {
Chris@42 809 E T1V, T22, T1z, T1A;
Chris@42 810 T1V = T1R + T1U;
Chris@42 811 T22 = T1Y + T21;
Chris@42 812 T23 = T1V + T22;
Chris@42 813 T2j = KP559016994 * (T1V - T22);
Chris@42 814 T1z = T1e + T1j;
Chris@42 815 T1A = T1p + T1u;
Chris@42 816 T1B = KP559016994 * (T1z - T1A);
Chris@42 817 T1H = T1z + T1A;
Chris@42 818 }
Chris@42 819 T3Q = T3O - T3P;
Chris@42 820 T3T = T3R - T3S;
Chris@42 821 T3U = T3Q + T3T;
Chris@42 822 T3W = KP559016994 * (T3Q - T3T);
Chris@42 823 {
Chris@42 824 E T3E, T3F, TP, T10;
Chris@42 825 T3E = T3r + T3s;
Chris@42 826 T3F = T3u + T3v;
Chris@42 827 T3G = T3E + T3F;
Chris@42 828 T3I = KP559016994 * (T3E - T3F);
Chris@42 829 TP = TJ + TO;
Chris@42 830 T10 = TU + TZ;
Chris@42 831 T11 = KP559016994 * (TP - T10);
Chris@42 832 T17 = TP + T10;
Chris@42 833 }
Chris@42 834 }
Chris@42 835 }
Chris@42 836 {
Chris@42 837 E TD, T27, T3c, T3e, T2o, T36, T2A, T2U, T1N, T2Z, T2t, T2J, T1x, T2X, T2r;
Chris@42 838 E T2F, T2g, T34, T2y, T2Q;
Chris@42 839 TD = T7 + TC;
Chris@42 840 T27 = T23 + T26;
Chris@42 841 {
Chris@42 842 E T39, T3b, T38, T3a;
Chris@42 843 T39 = T16 + T17;
Chris@42 844 T3b = T1H + T1G;
Chris@42 845 T38 = W[8];
Chris@42 846 T3a = W[9];
Chris@42 847 T3c = FMA(T38, T39, T3a * T3b);
Chris@42 848 T3e = FNMS(T3a, T39, T38 * T3b);
Chris@42 849 }
Chris@42 850 {
Chris@42 851 E T2n, T2S, T2k, T2T, T2i;
Chris@42 852 T2n = FNMS(KP951056516, T2m, KP587785252 * T2l);
Chris@42 853 T2S = FMA(KP951056516, T2l, KP587785252 * T2m);
Chris@42 854 T2i = FNMS(KP250000000, T23, T26);
Chris@42 855 T2k = T2i - T2j;
Chris@42 856 T2T = T2j + T2i;
Chris@42 857 T2o = T2k - T2n;
Chris@42 858 T36 = T2T - T2S;
Chris@42 859 T2A = T2n + T2k;
Chris@42 860 T2U = T2S + T2T;
Chris@42 861 }
Chris@42 862 {
Chris@42 863 E T1M, T2H, T1J, T2I, T1I;
Chris@42 864 T1M = FMA(KP951056516, T1K, KP587785252 * T1L);
Chris@42 865 T2H = FNMS(KP951056516, T1L, KP587785252 * T1K);
Chris@42 866 T1I = FNMS(KP250000000, T1H, T1G);
Chris@42 867 T1J = T1B + T1I;
Chris@42 868 T2I = T1I - T1B;
Chris@42 869 T1N = T1J - T1M;
Chris@42 870 T2Z = T2I - T2H;
Chris@42 871 T2t = T1M + T1J;
Chris@42 872 T2J = T2H + T2I;
Chris@42 873 }
Chris@42 874 {
Chris@42 875 E T1w, T2E, T19, T2D, T18;
Chris@42 876 T1w = FMA(KP951056516, T1k, KP587785252 * T1v);
Chris@42 877 T2E = FNMS(KP951056516, T1v, KP587785252 * T1k);
Chris@42 878 T18 = FNMS(KP250000000, T17, T16);
Chris@42 879 T19 = T11 + T18;
Chris@42 880 T2D = T18 - T11;
Chris@42 881 T1x = T19 + T1w;
Chris@42 882 T2X = T2D + T2E;
Chris@42 883 T2r = T19 - T1w;
Chris@42 884 T2F = T2D - T2E;
Chris@42 885 }
Chris@42 886 {
Chris@42 887 E T2f, T2P, T2c, T2O, T2a;
Chris@42 888 T2f = FNMS(KP951056516, T2e, KP587785252 * T2d);
Chris@42 889 T2P = FMA(KP951056516, T2d, KP587785252 * T2e);
Chris@42 890 T2a = FNMS(KP250000000, TC, T7);
Chris@42 891 T2c = T2a - T2b;
Chris@42 892 T2O = T2b + T2a;
Chris@42 893 T2g = T2c + T2f;
Chris@42 894 T34 = T2O + T2P;
Chris@42 895 T2y = T2c - T2f;
Chris@42 896 T2Q = T2O - T2P;
Chris@42 897 }
Chris@42 898 {
Chris@42 899 E T1O, T28, TE, T1y;
Chris@42 900 TE = W[0];
Chris@42 901 T1y = W[1];
Chris@42 902 T1O = FMA(TE, T1x, T1y * T1N);
Chris@42 903 T28 = FNMS(T1y, T1x, TE * T1N);
Chris@42 904 Rp[0] = TD - T1O;
Chris@42 905 Ip[0] = T27 + T28;
Chris@42 906 Rm[0] = TD + T1O;
Chris@42 907 Im[0] = T28 - T27;
Chris@42 908 }
Chris@42 909 {
Chris@42 910 E T37, T3d, T33, T35;
Chris@42 911 T33 = W[6];
Chris@42 912 T35 = W[7];
Chris@42 913 T37 = FNMS(T35, T36, T33 * T34);
Chris@42 914 T3d = FMA(T35, T34, T33 * T36);
Chris@42 915 Rp[WS(rs, 2)] = T37 - T3c;
Chris@42 916 Ip[WS(rs, 2)] = T3d + T3e;
Chris@42 917 Rm[WS(rs, 2)] = T37 + T3c;
Chris@42 918 Im[WS(rs, 2)] = T3e - T3d;
Chris@42 919 }
Chris@42 920 {
Chris@42 921 E T2p, T2v, T2u, T2w;
Chris@42 922 {
Chris@42 923 E T29, T2h, T2q, T2s;
Chris@42 924 T29 = W[14];
Chris@42 925 T2h = W[15];
Chris@42 926 T2p = FNMS(T2h, T2o, T29 * T2g);
Chris@42 927 T2v = FMA(T2h, T2g, T29 * T2o);
Chris@42 928 T2q = W[16];
Chris@42 929 T2s = W[17];
Chris@42 930 T2u = FMA(T2q, T2r, T2s * T2t);
Chris@42 931 T2w = FNMS(T2s, T2r, T2q * T2t);
Chris@42 932 }
Chris@42 933 Rp[WS(rs, 4)] = T2p - T2u;
Chris@42 934 Ip[WS(rs, 4)] = T2v + T2w;
Chris@42 935 Rm[WS(rs, 4)] = T2p + T2u;
Chris@42 936 Im[WS(rs, 4)] = T2w - T2v;
Chris@42 937 }
Chris@42 938 {
Chris@42 939 E T2B, T2L, T2K, T2M;
Chris@42 940 {
Chris@42 941 E T2x, T2z, T2C, T2G;
Chris@42 942 T2x = W[22];
Chris@42 943 T2z = W[23];
Chris@42 944 T2B = FNMS(T2z, T2A, T2x * T2y);
Chris@42 945 T2L = FMA(T2z, T2y, T2x * T2A);
Chris@42 946 T2C = W[24];
Chris@42 947 T2G = W[25];
Chris@42 948 T2K = FMA(T2C, T2F, T2G * T2J);
Chris@42 949 T2M = FNMS(T2G, T2F, T2C * T2J);
Chris@42 950 }
Chris@42 951 Rp[WS(rs, 6)] = T2B - T2K;
Chris@42 952 Ip[WS(rs, 6)] = T2L + T2M;
Chris@42 953 Rm[WS(rs, 6)] = T2B + T2K;
Chris@42 954 Im[WS(rs, 6)] = T2M - T2L;
Chris@42 955 }
Chris@42 956 {
Chris@42 957 E T2V, T31, T30, T32;
Chris@42 958 {
Chris@42 959 E T2N, T2R, T2W, T2Y;
Chris@42 960 T2N = W[30];
Chris@42 961 T2R = W[31];
Chris@42 962 T2V = FNMS(T2R, T2U, T2N * T2Q);
Chris@42 963 T31 = FMA(T2R, T2Q, T2N * T2U);
Chris@42 964 T2W = W[32];
Chris@42 965 T2Y = W[33];
Chris@42 966 T30 = FMA(T2W, T2X, T2Y * T2Z);
Chris@42 967 T32 = FNMS(T2Y, T2X, T2W * T2Z);
Chris@42 968 }
Chris@42 969 Rp[WS(rs, 8)] = T2V - T30;
Chris@42 970 Ip[WS(rs, 8)] = T31 + T32;
Chris@42 971 Rm[WS(rs, 8)] = T2V + T30;
Chris@42 972 Im[WS(rs, 8)] = T32 - T31;
Chris@42 973 }
Chris@42 974 }
Chris@42 975 {
Chris@42 976 E T4F, T4P, T5c, T5e, T3y, T54, T4o, T4S, T4h, T4Z, T4x, T4N, T45, T4X, T4v;
Chris@42 977 E T4J, T3K, T56, T4s, T4U;
Chris@42 978 {
Chris@42 979 E T4C, T4E, T4B, T4D;
Chris@42 980 T4C = T3g + T3n;
Chris@42 981 T4E = T3G + T3D;
Chris@42 982 T4B = W[18];
Chris@42 983 T4D = W[19];
Chris@42 984 T4F = FNMS(T4D, T4E, T4B * T4C);
Chris@42 985 T4P = FMA(T4D, T4C, T4B * T4E);
Chris@42 986 }
Chris@42 987 {
Chris@42 988 E T59, T5b, T58, T5a;
Chris@42 989 T59 = T3N + T3U;
Chris@42 990 T5b = T4d + T4a;
Chris@42 991 T58 = W[28];
Chris@42 992 T5a = W[29];
Chris@42 993 T5c = FMA(T58, T59, T5a * T5b);
Chris@42 994 T5e = FNMS(T5a, T59, T58 * T5b);
Chris@42 995 }
Chris@42 996 {
Chris@42 997 E T3x, T4n, T3q, T4m, T3o;
Chris@42 998 T3x = FNMS(KP951056516, T3w, KP587785252 * T3t);
Chris@42 999 T4n = FMA(KP951056516, T3t, KP587785252 * T3w);
Chris@42 1000 T3o = FNMS(KP250000000, T3n, T3g);
Chris@42 1001 T3q = T3o - T3p;
Chris@42 1002 T4m = T3p + T3o;
Chris@42 1003 T3y = T3q - T3x;
Chris@42 1004 T54 = T4m + T4n;
Chris@42 1005 T4o = T4m - T4n;
Chris@42 1006 T4S = T3q + T3x;
Chris@42 1007 }
Chris@42 1008 {
Chris@42 1009 E T49, T4M, T4g, T4L, T4e;
Chris@42 1010 T49 = FNMS(KP951056516, T48, KP587785252 * T47);
Chris@42 1011 T4M = FMA(KP951056516, T47, KP587785252 * T48);
Chris@42 1012 T4e = FNMS(KP250000000, T4d, T4a);
Chris@42 1013 T4g = T4e - T4f;
Chris@42 1014 T4L = T4f + T4e;
Chris@42 1015 T4h = T49 + T4g;
Chris@42 1016 T4Z = T4M + T4L;
Chris@42 1017 T4x = T4g - T49;
Chris@42 1018 T4N = T4L - T4M;
Chris@42 1019 }
Chris@42 1020 {
Chris@42 1021 E T44, T4I, T3X, T4H, T3V;
Chris@42 1022 T44 = FNMS(KP951056516, T43, KP587785252 * T40);
Chris@42 1023 T4I = FMA(KP951056516, T40, KP587785252 * T43);
Chris@42 1024 T3V = FNMS(KP250000000, T3U, T3N);
Chris@42 1025 T3X = T3V - T3W;
Chris@42 1026 T4H = T3W + T3V;
Chris@42 1027 T45 = T3X - T44;
Chris@42 1028 T4X = T4H - T4I;
Chris@42 1029 T4v = T3X + T44;
Chris@42 1030 T4J = T4H + T4I;
Chris@42 1031 }
Chris@42 1032 {
Chris@42 1033 E T3C, T4q, T3J, T4r, T3H;
Chris@42 1034 T3C = FNMS(KP951056516, T3B, KP587785252 * T3A);
Chris@42 1035 T4q = FMA(KP951056516, T3A, KP587785252 * T3B);
Chris@42 1036 T3H = FNMS(KP250000000, T3G, T3D);
Chris@42 1037 T3J = T3H - T3I;
Chris@42 1038 T4r = T3I + T3H;
Chris@42 1039 T3K = T3C + T3J;
Chris@42 1040 T56 = T4r - T4q;
Chris@42 1041 T4s = T4q + T4r;
Chris@42 1042 T4U = T3J - T3C;
Chris@42 1043 }
Chris@42 1044 {
Chris@42 1045 E T4O, T4Q, T4G, T4K;
Chris@42 1046 T4G = W[20];
Chris@42 1047 T4K = W[21];
Chris@42 1048 T4O = FMA(T4G, T4J, T4K * T4N);
Chris@42 1049 T4Q = FNMS(T4K, T4J, T4G * T4N);
Chris@42 1050 Rp[WS(rs, 5)] = T4F - T4O;
Chris@42 1051 Ip[WS(rs, 5)] = T4P + T4Q;
Chris@42 1052 Rm[WS(rs, 5)] = T4F + T4O;
Chris@42 1053 Im[WS(rs, 5)] = T4Q - T4P;
Chris@42 1054 }
Chris@42 1055 {
Chris@42 1056 E T57, T5d, T53, T55;
Chris@42 1057 T53 = W[26];
Chris@42 1058 T55 = W[27];
Chris@42 1059 T57 = FNMS(T55, T56, T53 * T54);
Chris@42 1060 T5d = FMA(T55, T54, T53 * T56);
Chris@42 1061 Rp[WS(rs, 7)] = T57 - T5c;
Chris@42 1062 Ip[WS(rs, 7)] = T5d + T5e;
Chris@42 1063 Rm[WS(rs, 7)] = T57 + T5c;
Chris@42 1064 Im[WS(rs, 7)] = T5e - T5d;
Chris@42 1065 }
Chris@42 1066 {
Chris@42 1067 E T3L, T4j, T4i, T4k;
Chris@42 1068 {
Chris@42 1069 E T3f, T3z, T3M, T46;
Chris@42 1070 T3f = W[2];
Chris@42 1071 T3z = W[3];
Chris@42 1072 T3L = FNMS(T3z, T3K, T3f * T3y);
Chris@42 1073 T4j = FMA(T3z, T3y, T3f * T3K);
Chris@42 1074 T3M = W[4];
Chris@42 1075 T46 = W[5];
Chris@42 1076 T4i = FMA(T3M, T45, T46 * T4h);
Chris@42 1077 T4k = FNMS(T46, T45, T3M * T4h);
Chris@42 1078 }
Chris@42 1079 Rp[WS(rs, 1)] = T3L - T4i;
Chris@42 1080 Ip[WS(rs, 1)] = T4j + T4k;
Chris@42 1081 Rm[WS(rs, 1)] = T3L + T4i;
Chris@42 1082 Im[WS(rs, 1)] = T4k - T4j;
Chris@42 1083 }
Chris@42 1084 {
Chris@42 1085 E T4t, T4z, T4y, T4A;
Chris@42 1086 {
Chris@42 1087 E T4l, T4p, T4u, T4w;
Chris@42 1088 T4l = W[10];
Chris@42 1089 T4p = W[11];
Chris@42 1090 T4t = FNMS(T4p, T4s, T4l * T4o);
Chris@42 1091 T4z = FMA(T4p, T4o, T4l * T4s);
Chris@42 1092 T4u = W[12];
Chris@42 1093 T4w = W[13];
Chris@42 1094 T4y = FMA(T4u, T4v, T4w * T4x);
Chris@42 1095 T4A = FNMS(T4w, T4v, T4u * T4x);
Chris@42 1096 }
Chris@42 1097 Rp[WS(rs, 3)] = T4t - T4y;
Chris@42 1098 Ip[WS(rs, 3)] = T4z + T4A;
Chris@42 1099 Rm[WS(rs, 3)] = T4t + T4y;
Chris@42 1100 Im[WS(rs, 3)] = T4A - T4z;
Chris@42 1101 }
Chris@42 1102 {
Chris@42 1103 E T4V, T51, T50, T52;
Chris@42 1104 {
Chris@42 1105 E T4R, T4T, T4W, T4Y;
Chris@42 1106 T4R = W[34];
Chris@42 1107 T4T = W[35];
Chris@42 1108 T4V = FNMS(T4T, T4U, T4R * T4S);
Chris@42 1109 T51 = FMA(T4T, T4S, T4R * T4U);
Chris@42 1110 T4W = W[36];
Chris@42 1111 T4Y = W[37];
Chris@42 1112 T50 = FMA(T4W, T4X, T4Y * T4Z);
Chris@42 1113 T52 = FNMS(T4Y, T4X, T4W * T4Z);
Chris@42 1114 }
Chris@42 1115 Rp[WS(rs, 9)] = T4V - T50;
Chris@42 1116 Ip[WS(rs, 9)] = T51 + T52;
Chris@42 1117 Rm[WS(rs, 9)] = T4V + T50;
Chris@42 1118 Im[WS(rs, 9)] = T52 - T51;
Chris@42 1119 }
Chris@42 1120 }
Chris@42 1121 }
Chris@42 1122 }
Chris@42 1123 }
Chris@42 1124
/*
 * Twiddle-instruction table for the hc2cbdft2_20 codelet.  It is referenced
 * by the `desc` descriptor defined after it, which is what gets handed to
 * the planner at registration time.
 *
 * NOTE(review): the entry layout is defined by `tw_instr` outside this file.
 * Presumably TW_FULL with a last field of 20 asks for the complete twiddle
 * set of a size-20 transform (the codelet body indexes the W array up to
 * W[37]) and TW_NEXT closes the list -- confirm against the tw_instr
 * declaration before relying on this.
 */
Chris@42 1125 static const tw_instr twinstr[] = {
Chris@42 1126 {TW_FULL, 1, 20},
Chris@42 1127 {TW_NEXT, 1, 0}
Chris@42 1128 };
Chris@42 1129
/*
 * Descriptor for this codelet, passed by address to X(khc2c_register).
 * The initializer supplies, in order: the value 20, the codelet's name
 * string, the twiddle-instruction table `twinstr`, a pointer to GENUS, and
 * a four-element tuple.
 *
 * NOTE(review): field meanings come from the `hc2c_desc` declaration, which
 * is not visible here.  The leading 20 is presumably the transform size and
 * the trailing tuple looks like an operation-count record (likely
 * adds / multiplies / fused multiply-adds / other) -- confirm against the
 * struct definition.
 */
Chris@42 1130 static const hc2c_desc desc = { 20, "hc2cbdft2_20", twinstr, &GENUS, {224, 62, 62, 0} };
Chris@42 1131
/*
 * Registration entry point for the hc2cbdft2_20 codelet.
 *
 * Makes a single call to X(khc2c_register), giving it the planner `p`, the
 * codelet function `hc2cbdft2_20`, the address of the static descriptor
 * `desc`, and the HC2C_VIA_DFT constant.  Nothing is returned.
 *
 * NOTE(review): X(...) is a project macro defined elsewhere (presumably a
 * symbol-name prefixing macro), so the exported symbol name depends on how
 * the library is configured -- confirm in the project headers.
 */
Chris@42 1132 void X(codelet_hc2cbdft2_20) (planner *p) {
Chris@42 1133 X(khc2c_register) (p, hc2cbdft2_20, &desc, HC2C_VIA_DFT);
Chris@42 1134 }
Chris@42 1135 #endif /* HAVE_FMA */