annotate src/fftw-3.3.8/rdft/scalar/r2cb/hc2cbdft2_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:08:01 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cbdft2_20 -include rdft/scalar/hc2cb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 286 FP additions, 148 FP multiplications,
Chris@82 32 * (or, 176 additions, 38 multiplications, 110 fused multiply/add),
Chris@82 33 * 104 stack variables, 4 constants, and 80 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cb.h"
Chris@82 36
/*
 * hc2cbdft2_20 -- FMA build of the generated size-20 kernel (see the
 * "Generated by" line above: gen_hc2cdft -fma ... -sign 1 -n 20 -dif).
 *
 * Runs the same straight-line computation once per m in [mb, me):
 *   1. loads 10 values from each of Rp, Ip, Rm and Im at WS(rs, 0) .. WS(rs, 9);
 *   2. combines them using the four DK constants and the 38 coefficients
 *      W[0] .. W[37];
 *   3. stores 10 results back into each of the same four arrays.
 * Every load is issued before the first store, and results overwrite the
 * locations that were read.
 *
 * Parameters (as used by this body):
 *   Rp, Ip  - arrays read and written at WS(rs, k); advanced by +ms per iteration.
 *   Rm, Im  - arrays read and written at WS(rs, k); advanced by -ms per iteration.
 *   W       - read-only coefficient table, 38 values consumed per iteration; it is
 *             offset by (mb - 1) * 38 before the first iteration. Presumably the
 *             twiddle factors described by twinstr -- confirm against the planner.
 *   rs      - stride handed to WS() for element indexing.
 *   mb, me  - half-open range of the loop counter m.
 *   ms      - per-iteration pointer step (added to Rp/Ip, subtracted from Rm/Im).
 */
Chris@82 37 static void hc2cbdft2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5) */
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180); /* numerically (sqrt(5)-1)/2 */
Chris@82 43 {
Chris@82 44 INT m;
/* The increment clause moves all four data pointers and W together; MAKE_VOLATILE_STRIDE is a project macro defined elsewhere. */
Chris@82 45 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
/* Temporaries that stay live from the load phase into the two output phases. */
Chris@82 46 E T27, T2o, T3T, T41, T2p, T40, T1N, T2Q, T1w, T2L, T4n, T59, T4A, T5e, T24;
Chris@82 47 E T2m, T2h, T2Z, T3P, T4J, T3W, T3Y, T7, TC, T2c, T2d, T3y, T3F, T3G, T3H;
Chris@82 48 E T46, T4d, T4e, T4f, T4r, T4u, T4v, T4w, T1E, T1H, T1I, T1J, TJ, T16, T17;
Chris@82 49 E T18;
/* Load phase: read all 40 inputs, forming a sum and a difference from each loaded pair, then combine those into the shared temporaries declared above. */
Chris@82 50 {
Chris@82 51 E T3, T1A, TI, T25, T6, TF, T1D, T26, Te, T47, T4k, TO, T1e, T3z, T3M;
Chris@82 52 E T1S, Tt, T4a, T4h, TZ, T1p, T3C, T3J, T1Z, TA, T4b, T4i, T14, T1u, T3D;
Chris@82 53 E T3K, T22, Tl, T48, T4l, TT, T1j, T3A, T3N, T1V;
Chris@82 54 {
Chris@82 55 E T1, T2, TG, TH;
Chris@82 56 T1 = Rp[0];
Chris@82 57 T2 = Rm[WS(rs, 9)];
Chris@82 58 T3 = T1 + T2;
Chris@82 59 T1A = T1 - T2;
Chris@82 60 TG = Ip[0];
Chris@82 61 TH = Im[WS(rs, 9)];
Chris@82 62 TI = TG + TH;
Chris@82 63 T25 = TG - TH;
Chris@82 64 }
Chris@82 65 {
Chris@82 66 E T4, T5, T1B, T1C;
Chris@82 67 T4 = Rp[WS(rs, 5)];
Chris@82 68 T5 = Rm[WS(rs, 4)];
Chris@82 69 T6 = T4 + T5;
Chris@82 70 TF = T4 - T5;
Chris@82 71 T1B = Ip[WS(rs, 5)];
Chris@82 72 T1C = Im[WS(rs, 4)];
Chris@82 73 T1D = T1B + T1C;
Chris@82 74 T26 = T1B - T1C;
Chris@82 75 }
Chris@82 76 {
Chris@82 77 E Ta, T1a, TN, T1Q, Td, TK, T1d, T1R;
Chris@82 78 {
Chris@82 79 E T8, T9, TL, TM;
Chris@82 80 T8 = Rp[WS(rs, 4)];
Chris@82 81 T9 = Rm[WS(rs, 5)];
Chris@82 82 Ta = T8 + T9;
Chris@82 83 T1a = T8 - T9;
Chris@82 84 TL = Ip[WS(rs, 4)];
Chris@82 85 TM = Im[WS(rs, 5)];
Chris@82 86 TN = TL + TM;
Chris@82 87 T1Q = TL - TM;
Chris@82 88 }
Chris@82 89 {
Chris@82 90 E Tb, Tc, T1b, T1c;
Chris@82 91 Tb = Rp[WS(rs, 9)];
Chris@82 92 Tc = Rm[0];
Chris@82 93 Td = Tb + Tc;
Chris@82 94 TK = Tb - Tc;
Chris@82 95 T1b = Ip[WS(rs, 9)];
Chris@82 96 T1c = Im[0];
Chris@82 97 T1d = T1b + T1c;
Chris@82 98 T1R = T1b - T1c;
Chris@82 99 }
Chris@82 100 Te = Ta + Td;
Chris@82 101 T47 = TN - TK;
Chris@82 102 T4k = T1a + T1d;
Chris@82 103 TO = TK + TN;
Chris@82 104 T1e = T1a - T1d;
Chris@82 105 T3z = Ta - Td;
Chris@82 106 T3M = T1Q - T1R;
Chris@82 107 T1S = T1Q + T1R;
Chris@82 108 }
Chris@82 109 {
Chris@82 110 E Tp, T1l, TY, T1X, Ts, TV, T1o, T1Y;
Chris@82 111 {
Chris@82 112 E Tn, To, TW, TX;
Chris@82 113 Tn = Rp[WS(rs, 8)];
Chris@82 114 To = Rm[WS(rs, 1)];
Chris@82 115 Tp = Tn + To;
Chris@82 116 T1l = Tn - To;
Chris@82 117 TW = Ip[WS(rs, 8)];
Chris@82 118 TX = Im[WS(rs, 1)];
Chris@82 119 TY = TW + TX;
Chris@82 120 T1X = TW - TX;
Chris@82 121 }
Chris@82 122 {
Chris@82 123 E Tq, Tr, T1m, T1n;
Chris@82 124 Tq = Rm[WS(rs, 6)];
Chris@82 125 Tr = Rp[WS(rs, 3)];
Chris@82 126 Ts = Tq + Tr;
Chris@82 127 TV = Tq - Tr;
Chris@82 128 T1m = Im[WS(rs, 6)];
Chris@82 129 T1n = Ip[WS(rs, 3)];
Chris@82 130 T1o = T1m + T1n;
Chris@82 131 T1Y = T1n - T1m;
Chris@82 132 }
Chris@82 133 Tt = Tp + Ts;
Chris@82 134 T4a = TY - TV;
Chris@82 135 T4h = T1l - T1o;
Chris@82 136 TZ = TV + TY;
Chris@82 137 T1p = T1l + T1o;
Chris@82 138 T3C = Tp - Ts;
Chris@82 139 T3J = T1X - T1Y;
Chris@82 140 T1Z = T1X + T1Y;
Chris@82 141 }
Chris@82 142 {
Chris@82 143 E Tw, T1q, T13, T20, Tz, T10, T1t, T21;
Chris@82 144 {
Chris@82 145 E Tu, Tv, T11, T12;
Chris@82 146 Tu = Rm[WS(rs, 7)];
Chris@82 147 Tv = Rp[WS(rs, 2)];
Chris@82 148 Tw = Tu + Tv;
Chris@82 149 T1q = Tu - Tv;
Chris@82 150 T11 = Im[WS(rs, 7)];
Chris@82 151 T12 = Ip[WS(rs, 2)];
Chris@82 152 T13 = T11 + T12;
Chris@82 153 T20 = T12 - T11;
Chris@82 154 }
Chris@82 155 {
Chris@82 156 E Tx, Ty, T1r, T1s;
Chris@82 157 Tx = Rm[WS(rs, 2)];
Chris@82 158 Ty = Rp[WS(rs, 7)];
Chris@82 159 Tz = Tx + Ty;
Chris@82 160 T10 = Tx - Ty;
Chris@82 161 T1r = Im[WS(rs, 2)];
Chris@82 162 T1s = Ip[WS(rs, 7)];
Chris@82 163 T1t = T1r + T1s;
Chris@82 164 T21 = T1s - T1r;
Chris@82 165 }
Chris@82 166 TA = Tw + Tz;
Chris@82 167 T4b = T10 + T13;
Chris@82 168 T4i = T1q - T1t;
Chris@82 169 T14 = T10 - T13;
Chris@82 170 T1u = T1q + T1t;
Chris@82 171 T3D = Tw - Tz;
Chris@82 172 T3K = T20 - T21;
Chris@82 173 T22 = T20 + T21;
Chris@82 174 }
Chris@82 175 {
Chris@82 176 E Th, T1f, TS, T1T, Tk, TP, T1i, T1U;
Chris@82 177 {
Chris@82 178 E Tf, Tg, TQ, TR;
Chris@82 179 Tf = Rm[WS(rs, 3)];
Chris@82 180 Tg = Rp[WS(rs, 6)];
Chris@82 181 Th = Tf + Tg;
Chris@82 182 T1f = Tf - Tg;
Chris@82 183 TQ = Im[WS(rs, 3)];
Chris@82 184 TR = Ip[WS(rs, 6)];
Chris@82 185 TS = TQ + TR;
Chris@82 186 T1T = TR - TQ;
Chris@82 187 }
Chris@82 188 {
Chris@82 189 E Ti, Tj, T1g, T1h;
Chris@82 190 Ti = Rp[WS(rs, 1)];
Chris@82 191 Tj = Rm[WS(rs, 8)];
Chris@82 192 Tk = Ti + Tj;
Chris@82 193 TP = Ti - Tj;
Chris@82 194 T1g = Ip[WS(rs, 1)];
Chris@82 195 T1h = Im[WS(rs, 8)];
Chris@82 196 T1i = T1g + T1h;
Chris@82 197 T1U = T1g - T1h;
Chris@82 198 }
Chris@82 199 Tl = Th + Tk;
Chris@82 200 T48 = TP + TS;
Chris@82 201 T4l = T1f + T1i;
Chris@82 202 TT = TP - TS;
Chris@82 203 T1j = T1f - T1i;
Chris@82 204 T3A = Th - Tk;
Chris@82 205 T3N = T1T - T1U;
Chris@82 206 T1V = T1T + T1U;
Chris@82 207 }
/* No more loads from here on: the remainder of this block only combines the sums/differences computed above. */
Chris@82 208 T27 = T25 + T26;
Chris@82 209 T2o = Tt - TA;
Chris@82 210 T3T = T25 - T26;
Chris@82 211 T41 = T3z - T3A;
Chris@82 212 T2p = Te - Tl;
Chris@82 213 {
Chris@82 214 E T1L, T1M, T1k, T1v;
Chris@82 215 T40 = T3C - T3D;
Chris@82 216 T1L = TO - TT;
Chris@82 217 T1M = TZ - T14;
Chris@82 218 T1N = FMA(KP618033988, T1M, T1L);
Chris@82 219 T2Q = FNMS(KP618033988, T1L, T1M);
Chris@82 220 T1k = T1e - T1j;
Chris@82 221 T1v = T1p - T1u;
Chris@82 222 T1w = FMA(KP618033988, T1v, T1k);
Chris@82 223 T2L = FNMS(KP618033988, T1k, T1v);
Chris@82 224 {
Chris@82 225 E T4j, T4m, T4y, T4z;
Chris@82 226 T4j = T4h - T4i;
Chris@82 227 T4m = T4k - T4l;
Chris@82 228 T4n = FNMS(KP618033988, T4m, T4j);
Chris@82 229 T59 = FMA(KP618033988, T4j, T4m);
Chris@82 230 T4y = T4a + T4b;
Chris@82 231 T4z = T47 + T48;
Chris@82 232 T4A = FNMS(KP618033988, T4z, T4y);
Chris@82 233 T5e = FMA(KP618033988, T4y, T4z);
Chris@82 234 }
Chris@82 235 }
Chris@82 236 {
Chris@82 237 E T3L, T3O, T4s, T4t;
Chris@82 238 {
Chris@82 239 E T1W, T23, T2f, T2g;
Chris@82 240 T1W = T1S + T1V;
Chris@82 241 T23 = T1Z + T22;
Chris@82 242 T24 = T1W + T23;
Chris@82 243 T2m = T1W - T23;
Chris@82 244 T2f = T1Z - T22;
Chris@82 245 T2g = T1S - T1V;
Chris@82 246 T2h = FNMS(KP618033988, T2g, T2f);
Chris@82 247 T2Z = FMA(KP618033988, T2f, T2g);
Chris@82 248 }
Chris@82 249 T3L = T3J - T3K;
Chris@82 250 T3O = T3M - T3N;
Chris@82 251 T3P = FNMS(KP618033988, T3O, T3L);
Chris@82 252 T4J = FMA(KP618033988, T3L, T3O);
Chris@82 253 {
Chris@82 254 E T3U, T3V, Tm, TB;
Chris@82 255 T3U = T3M + T3N;
Chris@82 256 T3V = T3J + T3K;
Chris@82 257 T3W = T3U + T3V;
Chris@82 258 T3Y = T3U - T3V;
Chris@82 259 T7 = T3 + T6;
Chris@82 260 Tm = Te + Tl;
Chris@82 261 TB = Tt + TA;
Chris@82 262 TC = Tm + TB;
Chris@82 263 T2c = FNMS(KP250000000, TC, T7);
Chris@82 264 T2d = Tm - TB;
Chris@82 265 }
Chris@82 266 {
Chris@82 267 E T3B, T3E, T49, T4c;
Chris@82 268 T3y = T3 - T6;
Chris@82 269 T3B = T3z + T3A;
Chris@82 270 T3E = T3C + T3D;
Chris@82 271 T3F = T3B + T3E;
Chris@82 272 T3G = FNMS(KP250000000, T3F, T3y);
Chris@82 273 T3H = T3B - T3E;
Chris@82 274 T46 = TI - TF;
Chris@82 275 T49 = T47 - T48;
Chris@82 276 T4c = T4a - T4b;
Chris@82 277 T4d = T49 + T4c;
Chris@82 278 T4e = FNMS(KP250000000, T4d, T46);
Chris@82 279 T4f = T49 - T4c;
Chris@82 280 }
Chris@82 281 T4r = T1A + T1D;
Chris@82 282 T4s = T4k + T4l;
Chris@82 283 T4t = T4h + T4i;
Chris@82 284 T4u = T4s + T4t;
Chris@82 285 T4v = FNMS(KP250000000, T4u, T4r);
Chris@82 286 T4w = T4s - T4t;
Chris@82 287 {
Chris@82 288 E T1F, T1G, TU, T15;
Chris@82 289 T1E = T1A - T1D;
Chris@82 290 T1F = T1e + T1j;
Chris@82 291 T1G = T1p + T1u;
Chris@82 292 T1H = T1F + T1G;
Chris@82 293 T1I = FNMS(KP250000000, T1H, T1E);
Chris@82 294 T1J = T1F - T1G;
Chris@82 295 TJ = TF + TI;
Chris@82 296 TU = TO + TT;
Chris@82 297 T15 = TZ + T14;
Chris@82 298 T16 = TU + T15;
Chris@82 299 T17 = FNMS(KP250000000, T16, TJ);
Chris@82 300 T18 = TU - T15;
Chris@82 301 }
Chris@82 302 }
Chris@82 303 }
/*
 * First output phase: produces the results stored at WS(rs, 0), 2, 4, 6 and 8.
 * Coefficients read here: W[0], W[1], W[6]..W[9], W[14]..W[17], W[22]..W[25], W[30]..W[33].
 * Each store group has the shape Rp = a - b, Rm = a + b, Ip = c + d, Im = d - c.
 */
Chris@82 304 {
Chris@82 305 E TD, T28, T3o, T3r, T3p, T3v, T2r, T3l, T2H, T35, T2b, T2j, T2k, T2z, T2D;
Chris@82 306 E T2F, T2G, T2T, T2X, T31, T32, T3d, T3h, T3j, T3k, T3t, T1x, T2u, T1O, T2x;
Chris@82 307 E T1y, T29, T2v, T2B, T2M, T38, T2R, T3b, T2N, T2V, T39, T3f, T3n, T1P, T2a;
Chris@82 308 E T1z;
Chris@82 309 TD = T7 + TC;
Chris@82 310 T28 = T24 + T27;
Chris@82 311 T3o = TJ + T16;
Chris@82 312 T3r = T1H + T1E;
Chris@82 313 T3n = W[8];
Chris@82 314 T3p = T3n * T3o;
Chris@82 315 T3v = T3n * T3r;
Chris@82 316 {
Chris@82 317 E T2q, T34, T2n, T33, T2l;
Chris@82 318 T2q = FNMS(KP618033988, T2p, T2o);
Chris@82 319 T34 = FMA(KP618033988, T2o, T2p);
Chris@82 320 T2l = FNMS(KP250000000, T24, T27);
Chris@82 321 T2n = FNMS(KP559016994, T2m, T2l);
Chris@82 322 T33 = FMA(KP559016994, T2m, T2l);
Chris@82 323 T2r = FMA(KP951056516, T2q, T2n);
Chris@82 324 T3l = FNMS(KP951056516, T34, T33);
Chris@82 325 T2H = FNMS(KP951056516, T2q, T2n);
Chris@82 326 T35 = FMA(KP951056516, T34, T33);
Chris@82 327 }
Chris@82 328 {
Chris@82 329 E T2i, T2E, T2e, T30, T3i, T2Y;
Chris@82 330 T2e = FNMS(KP559016994, T2d, T2c);
Chris@82 331 T2i = FNMS(KP951056516, T2h, T2e);
Chris@82 332 T2E = FMA(KP951056516, T2h, T2e);
Chris@82 333 T2b = W[14];
Chris@82 334 T2j = T2b * T2i;
Chris@82 335 T2k = W[15];
Chris@82 336 T2z = T2k * T2i;
Chris@82 337 T2D = W[22];
Chris@82 338 T2F = T2D * T2E;
Chris@82 339 T2G = W[23];
Chris@82 340 T2T = T2G * T2E;
Chris@82 341 T2Y = FMA(KP559016994, T2d, T2c);
Chris@82 342 T30 = FNMS(KP951056516, T2Z, T2Y);
Chris@82 343 T3i = FMA(KP951056516, T2Z, T2Y);
Chris@82 344 T2X = W[30];
Chris@82 345 T31 = T2X * T30;
Chris@82 346 T32 = W[31];
Chris@82 347 T3d = T32 * T30;
Chris@82 348 T3h = W[6];
Chris@82 349 T3j = T3h * T3i;
Chris@82 350 T3k = W[7];
Chris@82 351 T3t = T3k * T3i;
Chris@82 352 }
Chris@82 353 {
Chris@82 354 E T19, T1K, TE, T2t;
Chris@82 355 T19 = FMA(KP559016994, T18, T17);
Chris@82 356 T1x = FMA(KP951056516, T1w, T19);
Chris@82 357 T2u = FNMS(KP951056516, T1w, T19);
Chris@82 358 T1K = FMA(KP559016994, T1J, T1I);
Chris@82 359 T1O = FNMS(KP951056516, T1N, T1K);
Chris@82 360 T2x = FMA(KP951056516, T1N, T1K);
Chris@82 361 TE = W[0];
Chris@82 362 T1y = TE * T1x;
Chris@82 363 T29 = TE * T1O;
Chris@82 364 T2t = W[16];
Chris@82 365 T2v = T2t * T2u;
Chris@82 366 T2B = T2t * T2x;
Chris@82 367 }
Chris@82 368 {
Chris@82 369 E T2K, T2P, T2J, T37;
Chris@82 370 T2K = FNMS(KP559016994, T18, T17);
Chris@82 371 T2M = FMA(KP951056516, T2L, T2K);
Chris@82 372 T38 = FNMS(KP951056516, T2L, T2K);
Chris@82 373 T2P = FNMS(KP559016994, T1J, T1I);
Chris@82 374 T2R = FNMS(KP951056516, T2Q, T2P);
Chris@82 375 T3b = FMA(KP951056516, T2Q, T2P);
Chris@82 376 T2J = W[24];
Chris@82 377 T2N = T2J * T2M;
Chris@82 378 T2V = T2J * T2R;
Chris@82 379 T37 = W[32];
Chris@82 380 T39 = T37 * T38;
Chris@82 381 T3f = T37 * T3b;
Chris@82 382 }
Chris@82 383 T1z = W[1];
Chris@82 384 T1P = FMA(T1z, T1O, T1y);
Chris@82 385 T2a = FNMS(T1z, T1x, T29);
/* First stores of the iteration; every input was already loaded above. */
Chris@82 386 Rp[0] = TD - T1P;
Chris@82 387 Ip[0] = T28 + T2a;
Chris@82 388 Rm[0] = TD + T1P;
Chris@82 389 Im[0] = T2a - T28;
Chris@82 390 {
Chris@82 391 E T3m, T3u, T3s, T3w, T3q;
Chris@82 392 T3m = FNMS(T3k, T3l, T3j);
Chris@82 393 T3u = FMA(T3h, T3l, T3t);
Chris@82 394 T3q = W[9];
Chris@82 395 T3s = FMA(T3q, T3r, T3p);
Chris@82 396 T3w = FNMS(T3q, T3o, T3v);
Chris@82 397 Rp[WS(rs, 2)] = T3m - T3s;
Chris@82 398 Ip[WS(rs, 2)] = T3u + T3w;
Chris@82 399 Rm[WS(rs, 2)] = T3m + T3s;
Chris@82 400 Im[WS(rs, 2)] = T3w - T3u;
Chris@82 401 }
Chris@82 402 {
Chris@82 403 E T2s, T2A, T2y, T2C, T2w;
Chris@82 404 T2s = FNMS(T2k, T2r, T2j);
Chris@82 405 T2A = FMA(T2b, T2r, T2z);
Chris@82 406 T2w = W[17];
Chris@82 407 T2y = FMA(T2w, T2x, T2v);
Chris@82 408 T2C = FNMS(T2w, T2u, T2B);
Chris@82 409 Rp[WS(rs, 4)] = T2s - T2y;
Chris@82 410 Ip[WS(rs, 4)] = T2A + T2C;
Chris@82 411 Rm[WS(rs, 4)] = T2s + T2y;
Chris@82 412 Im[WS(rs, 4)] = T2C - T2A;
Chris@82 413 }
Chris@82 414 {
Chris@82 415 E T2I, T2U, T2S, T2W, T2O;
Chris@82 416 T2I = FNMS(T2G, T2H, T2F);
Chris@82 417 T2U = FMA(T2D, T2H, T2T);
Chris@82 418 T2O = W[25];
Chris@82 419 T2S = FMA(T2O, T2R, T2N);
Chris@82 420 T2W = FNMS(T2O, T2M, T2V);
Chris@82 421 Rp[WS(rs, 6)] = T2I - T2S;
Chris@82 422 Ip[WS(rs, 6)] = T2U + T2W;
Chris@82 423 Rm[WS(rs, 6)] = T2I + T2S;
Chris@82 424 Im[WS(rs, 6)] = T2W - T2U;
Chris@82 425 }
Chris@82 426 {
Chris@82 427 E T36, T3e, T3c, T3g, T3a;
Chris@82 428 T36 = FNMS(T32, T35, T31);
Chris@82 429 T3e = FMA(T2X, T35, T3d);
Chris@82 430 T3a = W[33];
Chris@82 431 T3c = FMA(T3a, T3b, T39);
Chris@82 432 T3g = FNMS(T3a, T38, T3f);
Chris@82 433 Rp[WS(rs, 8)] = T36 - T3c;
Chris@82 434 Ip[WS(rs, 8)] = T3e + T3g;
Chris@82 435 Rm[WS(rs, 8)] = T36 + T3c;
Chris@82 436 Im[WS(rs, 8)] = T3g - T3e;
Chris@82 437 }
Chris@82 438 }
/*
 * Second output phase: produces the results stored at WS(rs, 1), 3, 5, 7 and 9.
 * Coefficients read here: W[2]..W[5], W[10]..W[13], W[18]..W[21], W[26]..W[29], W[34]..W[37].
 * Same store shape as the first phase.
 */
Chris@82 439 {
Chris@82 440 E T55, T51, T53, T54, T5h, T5I, T5L, T5J, T5P, T43, T5F, T4P, T5p, T3x, T3R;
Chris@82 441 E T3S, T4D, T5l, T5n, T5o, T5x, T4H, T4L, T4M, T4X, T5B, T5D, T5E, T5N, T4o;
Chris@82 442 E T4S, T4B, T4V, T4p, T4F, T4T, T4Z, T5a, T5s, T5f, T5v, T5b, T5j, T5t, T5z;
Chris@82 443 E T52, T5H;
Chris@82 444 T55 = T3W + T3T;
Chris@82 445 T52 = T3y + T3F;
Chris@82 446 T51 = W[18];
Chris@82 447 T53 = T51 * T52;
Chris@82 448 T54 = W[19];
Chris@82 449 T5h = T54 * T52;
Chris@82 450 T5I = T46 + T4d;
Chris@82 451 T5L = T4u + T4r;
Chris@82 452 T5H = W[28];
Chris@82 453 T5J = T5H * T5I;
Chris@82 454 T5P = T5H * T5L;
Chris@82 455 {
Chris@82 456 E T42, T4O, T3Z, T4N, T3X;
Chris@82 457 T42 = FNMS(KP618033988, T41, T40);
Chris@82 458 T4O = FMA(KP618033988, T40, T41);
Chris@82 459 T3X = FNMS(KP250000000, T3W, T3T);
Chris@82 460 T3Z = FNMS(KP559016994, T3Y, T3X);
Chris@82 461 T4N = FMA(KP559016994, T3Y, T3X);
Chris@82 462 T43 = FNMS(KP951056516, T42, T3Z);
Chris@82 463 T5F = FNMS(KP951056516, T4O, T4N);
Chris@82 464 T4P = FMA(KP951056516, T4O, T4N);
Chris@82 465 T5p = FMA(KP951056516, T42, T3Z);
Chris@82 466 }
Chris@82 467 {
Chris@82 468 E T3Q, T5m, T3I, T4K, T5C, T4I;
Chris@82 469 T3I = FNMS(KP559016994, T3H, T3G);
Chris@82 470 T3Q = FMA(KP951056516, T3P, T3I);
Chris@82 471 T5m = FNMS(KP951056516, T3P, T3I);
Chris@82 472 T3x = W[2];
Chris@82 473 T3R = T3x * T3Q;
Chris@82 474 T3S = W[3];
Chris@82 475 T4D = T3S * T3Q;
Chris@82 476 T5l = W[34];
Chris@82 477 T5n = T5l * T5m;
Chris@82 478 T5o = W[35];
Chris@82 479 T5x = T5o * T5m;
Chris@82 480 T4I = FMA(KP559016994, T3H, T3G);
Chris@82 481 T4K = FNMS(KP951056516, T4J, T4I);
Chris@82 482 T5C = FMA(KP951056516, T4J, T4I);
Chris@82 483 T4H = W[10];
Chris@82 484 T4L = T4H * T4K;
Chris@82 485 T4M = W[11];
Chris@82 486 T4X = T4M * T4K;
Chris@82 487 T5B = W[26];
Chris@82 488 T5D = T5B * T5C;
Chris@82 489 T5E = W[27];
Chris@82 490 T5N = T5E * T5C;
Chris@82 491 }
Chris@82 492 {
Chris@82 493 E T4g, T4x, T45, T4R;
Chris@82 494 T4g = FNMS(KP559016994, T4f, T4e);
Chris@82 495 T4o = FMA(KP951056516, T4n, T4g);
Chris@82 496 T4S = FNMS(KP951056516, T4n, T4g);
Chris@82 497 T4x = FNMS(KP559016994, T4w, T4v);
Chris@82 498 T4B = FNMS(KP951056516, T4A, T4x);
Chris@82 499 T4V = FMA(KP951056516, T4A, T4x);
Chris@82 500 T45 = W[4];
Chris@82 501 T4p = T45 * T4o;
Chris@82 502 T4F = T45 * T4B;
Chris@82 503 T4R = W[12];
Chris@82 504 T4T = T4R * T4S;
Chris@82 505 T4Z = T4R * T4V;
Chris@82 506 }
Chris@82 507 {
Chris@82 508 E T58, T5d, T57, T5r;
Chris@82 509 T58 = FMA(KP559016994, T4f, T4e);
Chris@82 510 T5a = FMA(KP951056516, T59, T58);
Chris@82 511 T5s = FNMS(KP951056516, T59, T58);
Chris@82 512 T5d = FMA(KP559016994, T4w, T4v);
Chris@82 513 T5f = FNMS(KP951056516, T5e, T5d);
Chris@82 514 T5v = FMA(KP951056516, T5e, T5d);
Chris@82 515 T57 = W[20];
Chris@82 516 T5b = T57 * T5a;
Chris@82 517 T5j = T57 * T5f;
Chris@82 518 T5r = W[36];
Chris@82 519 T5t = T5r * T5s;
Chris@82 520 T5z = T5r * T5v;
Chris@82 521 }
Chris@82 522 {
Chris@82 523 E T44, T4E, T4C, T4G, T4q;
Chris@82 524 T44 = FNMS(T3S, T43, T3R);
Chris@82 525 T4E = FMA(T3x, T43, T4D);
Chris@82 526 T4q = W[5];
Chris@82 527 T4C = FMA(T4q, T4B, T4p);
Chris@82 528 T4G = FNMS(T4q, T4o, T4F);
Chris@82 529 Rp[WS(rs, 1)] = T44 - T4C;
Chris@82 530 Ip[WS(rs, 1)] = T4E + T4G;
Chris@82 531 Rm[WS(rs, 1)] = T44 + T4C;
Chris@82 532 Im[WS(rs, 1)] = T4G - T4E;
Chris@82 533 }
Chris@82 534 {
Chris@82 535 E T5G, T5O, T5M, T5Q, T5K;
Chris@82 536 T5G = FNMS(T5E, T5F, T5D);
Chris@82 537 T5O = FMA(T5B, T5F, T5N);
Chris@82 538 T5K = W[29];
Chris@82 539 T5M = FMA(T5K, T5L, T5J);
Chris@82 540 T5Q = FNMS(T5K, T5I, T5P);
Chris@82 541 Rp[WS(rs, 7)] = T5G - T5M;
Chris@82 542 Ip[WS(rs, 7)] = T5O + T5Q;
Chris@82 543 Rm[WS(rs, 7)] = T5G + T5M;
Chris@82 544 Im[WS(rs, 7)] = T5Q - T5O;
Chris@82 545 }
Chris@82 546 {
Chris@82 547 E T4Q, T4Y, T4W, T50, T4U;
Chris@82 548 T4Q = FNMS(T4M, T4P, T4L);
Chris@82 549 T4Y = FMA(T4H, T4P, T4X);
Chris@82 550 T4U = W[13];
Chris@82 551 T4W = FMA(T4U, T4V, T4T);
Chris@82 552 T50 = FNMS(T4U, T4S, T4Z);
Chris@82 553 Rp[WS(rs, 3)] = T4Q - T4W;
Chris@82 554 Ip[WS(rs, 3)] = T4Y + T50;
Chris@82 555 Rm[WS(rs, 3)] = T4Q + T4W;
Chris@82 556 Im[WS(rs, 3)] = T50 - T4Y;
Chris@82 557 }
Chris@82 558 {
Chris@82 559 E T56, T5i, T5g, T5k, T5c;
Chris@82 560 T56 = FNMS(T54, T55, T53);
Chris@82 561 T5i = FMA(T51, T55, T5h);
Chris@82 562 T5c = W[21];
Chris@82 563 T5g = FMA(T5c, T5f, T5b);
Chris@82 564 T5k = FNMS(T5c, T5a, T5j);
Chris@82 565 Rp[WS(rs, 5)] = T56 - T5g;
Chris@82 566 Ip[WS(rs, 5)] = T5i + T5k;
Chris@82 567 Rm[WS(rs, 5)] = T56 + T5g;
Chris@82 568 Im[WS(rs, 5)] = T5k - T5i;
Chris@82 569 }
Chris@82 570 {
Chris@82 571 E T5q, T5y, T5w, T5A, T5u;
Chris@82 572 T5q = FNMS(T5o, T5p, T5n);
Chris@82 573 T5y = FMA(T5l, T5p, T5x);
Chris@82 574 T5u = W[37];
Chris@82 575 T5w = FMA(T5u, T5v, T5t);
Chris@82 576 T5A = FNMS(T5u, T5s, T5z);
Chris@82 577 Rp[WS(rs, 9)] = T5q - T5w;
Chris@82 578 Ip[WS(rs, 9)] = T5y + T5A;
Chris@82 579 Rm[WS(rs, 9)] = T5q + T5w;
Chris@82 580 Im[WS(rs, 9)] = T5A - T5y;
Chris@82 581 }
Chris@82 582 }
Chris@82 583 }
Chris@82 584 }
Chris@82 585 }
Chris@82 586
/*
 * Two-entry tw_instr table referenced by desc: a TW_FULL entry with arguments
 * (1, 20) followed by a TW_NEXT entry with arguments (1, 0).
 * NOTE(review): presumably TW_FULL requests the complete twiddle set for n = 20
 * (19 complex values = 38 reals, which would match the W stride of 38 in the
 * kernel loop) and TW_NEXT terminates the list -- confirm against the tw_instr
 * definition.
 */
Chris@82 587 static const tw_instr twinstr[] = {
Chris@82 588 {TW_FULL, 1, 20},
Chris@82 589 {TW_NEXT, 1, 0}
Chris@82 590 };
Chris@82 591
/*
 * Descriptor handed to the planner at registration: size 20, the kernel's name
 * string, its twinstr table, a pointer to GENUS, and the tuple {176, 38, 110, 0}.
 * The first three numbers equal the "176 additions, 38 multiplications,
 * 110 fused multiply/add" counts in the generator's comment for this FMA build;
 * the meaning of the trailing 0 is not visible in this file.
 */
Chris@82 592 static const hc2c_desc desc = { 20, "hc2cbdft2_20", twinstr, &GENUS, {176, 38, 110, 0} };
Chris@82 593
/*
 * Registration entry point for this codelet: passes the kernel function, its
 * descriptor and the HC2C_VIA_DFT flag to X(khc2c_register) for planner p.
 * X() is a project macro applied to both the function name and the callee.
 */
Chris@82 594 void X(codelet_hc2cbdft2_20) (planner *p) {
Chris@82 595 X(khc2c_register) (p, hc2cbdft2_20, &desc, HC2C_VIA_DFT);
Chris@82 596 }
Chris@82 597 #else
Chris@82 598
Chris@82 599 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cbdft2_20 -include rdft/scalar/hc2cb.h */
Chris@82 600
Chris@82 601 /*
Chris@82 602 * This function contains 286 FP additions, 124 FP multiplications,
Chris@82 603 * (or, 224 additions, 62 multiplications, 62 fused multiply/add),
Chris@82 604 * 89 stack variables, 4 constants, and 80 memory accesses
Chris@82 605 */
Chris@82 606 #include "rdft/scalar/hc2cb.h"
Chris@82 607
Chris@82 608 static void hc2cbdft2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 609 {
Chris@82 610 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 611 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 612 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 613 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 614 {
Chris@82 615 INT m;
Chris@82 616 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@82 617 E T7, T3N, T4a, T16, T1G, T3g, T3D, T26, T1k, T3A, T3B, T1v, T2e, T48, T47;
Chris@82 618 E T2d, T1L, T43, T40, T1K, T2l, T3t, T2m, T3w, T3n, T3p, TC, T2b, T4d, T4f;
Chris@82 619 E T23, T2j, T1B, T1H, T3U, T3W, T3G, T3I, T11, T17;
Chris@82 620 {
Chris@82 621 E T3, T1C, T15, T24, T6, T12, T1F, T25;
Chris@82 622 {
Chris@82 623 E T1, T2, T13, T14;
Chris@82 624 T1 = Rp[0];
Chris@82 625 T2 = Rm[WS(rs, 9)];
Chris@82 626 T3 = T1 + T2;
Chris@82 627 T1C = T1 - T2;
Chris@82 628 T13 = Ip[0];
Chris@82 629 T14 = Im[WS(rs, 9)];
Chris@82 630 T15 = T13 + T14;
Chris@82 631 T24 = T13 - T14;
Chris@82 632 }
Chris@82 633 {
Chris@82 634 E T4, T5, T1D, T1E;
Chris@82 635 T4 = Rp[WS(rs, 5)];
Chris@82 636 T5 = Rm[WS(rs, 4)];
Chris@82 637 T6 = T4 + T5;
Chris@82 638 T12 = T4 - T5;
Chris@82 639 T1D = Ip[WS(rs, 5)];
Chris@82 640 T1E = Im[WS(rs, 4)];
Chris@82 641 T1F = T1D + T1E;
Chris@82 642 T25 = T1D - T1E;
Chris@82 643 }
Chris@82 644 T7 = T3 + T6;
Chris@82 645 T3N = T15 - T12;
Chris@82 646 T4a = T1C + T1F;
Chris@82 647 T16 = T12 + T15;
Chris@82 648 T1G = T1C - T1F;
Chris@82 649 T3g = T3 - T6;
Chris@82 650 T3D = T24 - T25;
Chris@82 651 T26 = T24 + T25;
Chris@82 652 }
Chris@82 653 {
Chris@82 654 E Te, T3O, T3Y, TJ, T1e, T3h, T3r, T1R, TA, T3S, T42, TZ, T1u, T3l, T3v;
Chris@82 655 E T21, Tl, T3P, T3Z, TO, T1j, T3i, T3s, T1U, Tt, T3R, T41, TU, T1p, T3k;
Chris@82 656 E T3u, T1Y;
Chris@82 657 {
Chris@82 658 E Ta, T1a, TI, T1P, Td, TF, T1d, T1Q;
Chris@82 659 {
Chris@82 660 E T8, T9, TG, TH;
Chris@82 661 T8 = Rp[WS(rs, 4)];
Chris@82 662 T9 = Rm[WS(rs, 5)];
Chris@82 663 Ta = T8 + T9;
Chris@82 664 T1a = T8 - T9;
Chris@82 665 TG = Ip[WS(rs, 4)];
Chris@82 666 TH = Im[WS(rs, 5)];
Chris@82 667 TI = TG + TH;
Chris@82 668 T1P = TG - TH;
Chris@82 669 }
Chris@82 670 {
Chris@82 671 E Tb, Tc, T1b, T1c;
Chris@82 672 Tb = Rp[WS(rs, 9)];
Chris@82 673 Tc = Rm[0];
Chris@82 674 Td = Tb + Tc;
Chris@82 675 TF = Tb - Tc;
Chris@82 676 T1b = Ip[WS(rs, 9)];
Chris@82 677 T1c = Im[0];
Chris@82 678 T1d = T1b + T1c;
Chris@82 679 T1Q = T1b - T1c;
Chris@82 680 }
Chris@82 681 Te = Ta + Td;
Chris@82 682 T3O = TI - TF;
Chris@82 683 T3Y = T1a + T1d;
Chris@82 684 TJ = TF + TI;
Chris@82 685 T1e = T1a - T1d;
Chris@82 686 T3h = Ta - Td;
Chris@82 687 T3r = T1P - T1Q;
Chris@82 688 T1R = T1P + T1Q;
Chris@82 689 }
Chris@82 690 {
Chris@82 691 E Tw, T1q, TY, T1Z, Tz, TV, T1t, T20;
Chris@82 692 {
Chris@82 693 E Tu, Tv, TW, TX;
Chris@82 694 Tu = Rm[WS(rs, 7)];
Chris@82 695 Tv = Rp[WS(rs, 2)];
Chris@82 696 Tw = Tu + Tv;
Chris@82 697 T1q = Tu - Tv;
Chris@82 698 TW = Im[WS(rs, 7)];
Chris@82 699 TX = Ip[WS(rs, 2)];
Chris@82 700 TY = TW + TX;
Chris@82 701 T1Z = TX - TW;
Chris@82 702 }
Chris@82 703 {
Chris@82 704 E Tx, Ty, T1r, T1s;
Chris@82 705 Tx = Rm[WS(rs, 2)];
Chris@82 706 Ty = Rp[WS(rs, 7)];
Chris@82 707 Tz = Tx + Ty;
Chris@82 708 TV = Tx - Ty;
Chris@82 709 T1r = Im[WS(rs, 2)];
Chris@82 710 T1s = Ip[WS(rs, 7)];
Chris@82 711 T1t = T1r + T1s;
Chris@82 712 T20 = T1s - T1r;
Chris@82 713 }
Chris@82 714 TA = Tw + Tz;
Chris@82 715 T3S = TV + TY;
Chris@82 716 T42 = T1q - T1t;
Chris@82 717 TZ = TV - TY;
Chris@82 718 T1u = T1q + T1t;
Chris@82 719 T3l = Tw - Tz;
Chris@82 720 T3v = T1Z - T20;
Chris@82 721 T21 = T1Z + T20;
Chris@82 722 }
Chris@82 723 {
Chris@82 724 E Th, T1f, TN, T1S, Tk, TK, T1i, T1T;
Chris@82 725 {
Chris@82 726 E Tf, Tg, TL, TM;
Chris@82 727 Tf = Rm[WS(rs, 3)];
Chris@82 728 Tg = Rp[WS(rs, 6)];
Chris@82 729 Th = Tf + Tg;
Chris@82 730 T1f = Tf - Tg;
Chris@82 731 TL = Im[WS(rs, 3)];
Chris@82 732 TM = Ip[WS(rs, 6)];
Chris@82 733 TN = TL + TM;
Chris@82 734 T1S = TM - TL;
Chris@82 735 }
Chris@82 736 {
Chris@82 737 E Ti, Tj, T1g, T1h;
Chris@82 738 Ti = Rp[WS(rs, 1)];
Chris@82 739 Tj = Rm[WS(rs, 8)];
Chris@82 740 Tk = Ti + Tj;
Chris@82 741 TK = Ti - Tj;
Chris@82 742 T1g = Ip[WS(rs, 1)];
Chris@82 743 T1h = Im[WS(rs, 8)];
Chris@82 744 T1i = T1g + T1h;
Chris@82 745 T1T = T1g - T1h;
Chris@82 746 }
Chris@82 747 Tl = Th + Tk;
Chris@82 748 T3P = TK + TN;
Chris@82 749 T3Z = T1f + T1i;
Chris@82 750 TO = TK - TN;
Chris@82 751 T1j = T1f - T1i;
Chris@82 752 T3i = Th - Tk;
Chris@82 753 T3s = T1S - T1T;
Chris@82 754 T1U = T1S + T1T;
Chris@82 755 }
Chris@82 756 {
Chris@82 757 E Tp, T1l, TT, T1W, Ts, TQ, T1o, T1X;
Chris@82 758 {
Chris@82 759 E Tn, To, TR, TS;
Chris@82 760 Tn = Rp[WS(rs, 8)];
Chris@82 761 To = Rm[WS(rs, 1)];
Chris@82 762 Tp = Tn + To;
Chris@82 763 T1l = Tn - To;
Chris@82 764 TR = Ip[WS(rs, 8)];
Chris@82 765 TS = Im[WS(rs, 1)];
Chris@82 766 TT = TR + TS;
Chris@82 767 T1W = TR - TS;
Chris@82 768 }
Chris@82 769 {
Chris@82 770 E Tq, Tr, T1m, T1n;
Chris@82 771 Tq = Rm[WS(rs, 6)];
Chris@82 772 Tr = Rp[WS(rs, 3)];
Chris@82 773 Ts = Tq + Tr;
Chris@82 774 TQ = Tq - Tr;
Chris@82 775 T1m = Im[WS(rs, 6)];
Chris@82 776 T1n = Ip[WS(rs, 3)];
Chris@82 777 T1o = T1m + T1n;
Chris@82 778 T1X = T1n - T1m;
Chris@82 779 }
Chris@82 780 Tt = Tp + Ts;
Chris@82 781 T3R = TT - TQ;
Chris@82 782 T41 = T1l - T1o;
Chris@82 783 TU = TQ + TT;
Chris@82 784 T1p = T1l + T1o;
Chris@82 785 T3k = Tp - Ts;
Chris@82 786 T3u = T1W - T1X;
Chris@82 787 T1Y = T1W + T1X;
Chris@82 788 }
Chris@82 789 T1k = T1e - T1j;
Chris@82 790 T3A = T3h - T3i;
Chris@82 791 T3B = T3k - T3l;
Chris@82 792 T1v = T1p - T1u;
Chris@82 793 T2e = T1Y - T21;
Chris@82 794 T48 = T3R + T3S;
Chris@82 795 T47 = T3O + T3P;
Chris@82 796 T2d = T1R - T1U;
Chris@82 797 T1L = TU - TZ;
Chris@82 798 T43 = T41 - T42;
Chris@82 799 T40 = T3Y - T3Z;
Chris@82 800 T1K = TJ - TO;
Chris@82 801 T2l = Te - Tl;
Chris@82 802 T3t = T3r - T3s;
Chris@82 803 T2m = Tt - TA;
Chris@82 804 T3w = T3u - T3v;
Chris@82 805 {
Chris@82 806 E T3j, T3m, Tm, TB;
Chris@82 807 T3j = T3h + T3i;
Chris@82 808 T3m = T3k + T3l;
Chris@82 809 T3n = T3j + T3m;
Chris@82 810 T3p = KP559016994 * (T3j - T3m);
Chris@82 811 Tm = Te + Tl;
Chris@82 812 TB = Tt + TA;
Chris@82 813 TC = Tm + TB;
Chris@82 814 T2b = KP559016994 * (Tm - TB);
Chris@82 815 }
Chris@82 816 {
Chris@82 817 E T4b, T4c, T3Q, T3T;
Chris@82 818 T4b = T3Y + T3Z;
Chris@82 819 T4c = T41 + T42;
Chris@82 820 T4d = T4b + T4c;
Chris@82 821 T4f = KP559016994 * (T4b - T4c);
Chris@82 822 {
Chris@82 823 E T1V, T22, T1z, T1A;
Chris@82 824 T1V = T1R + T1U;
Chris@82 825 T22 = T1Y + T21;
Chris@82 826 T23 = T1V + T22;
Chris@82 827 T2j = KP559016994 * (T1V - T22);
Chris@82 828 T1z = T1e + T1j;
Chris@82 829 T1A = T1p + T1u;
Chris@82 830 T1B = KP559016994 * (T1z - T1A);
Chris@82 831 T1H = T1z + T1A;
Chris@82 832 }
Chris@82 833 T3Q = T3O - T3P;
Chris@82 834 T3T = T3R - T3S;
Chris@82 835 T3U = T3Q + T3T;
Chris@82 836 T3W = KP559016994 * (T3Q - T3T);
Chris@82 837 {
Chris@82 838 E T3E, T3F, TP, T10;
Chris@82 839 T3E = T3r + T3s;
Chris@82 840 T3F = T3u + T3v;
Chris@82 841 T3G = T3E + T3F;
Chris@82 842 T3I = KP559016994 * (T3E - T3F);
Chris@82 843 TP = TJ + TO;
Chris@82 844 T10 = TU + TZ;
Chris@82 845 T11 = KP559016994 * (TP - T10);
Chris@82 846 T17 = TP + T10;
Chris@82 847 }
Chris@82 848 }
Chris@82 849 }
Chris@82 850 {
Chris@82 851 E TD, T27, T3c, T3e, T2o, T36, T2A, T2U, T1N, T2Z, T2t, T2J, T1x, T2X, T2r;
Chris@82 852 E T2F, T2g, T34, T2y, T2Q;
Chris@82 853 TD = T7 + TC;
Chris@82 854 T27 = T23 + T26;
Chris@82 855 {
Chris@82 856 E T39, T3b, T38, T3a;
Chris@82 857 T39 = T16 + T17;
Chris@82 858 T3b = T1H + T1G;
Chris@82 859 T38 = W[8];
Chris@82 860 T3a = W[9];
Chris@82 861 T3c = FMA(T38, T39, T3a * T3b);
Chris@82 862 T3e = FNMS(T3a, T39, T38 * T3b);
Chris@82 863 }
Chris@82 864 {
Chris@82 865 E T2n, T2S, T2k, T2T, T2i;
Chris@82 866 T2n = FNMS(KP951056516, T2m, KP587785252 * T2l);
Chris@82 867 T2S = FMA(KP951056516, T2l, KP587785252 * T2m);
Chris@82 868 T2i = FNMS(KP250000000, T23, T26);
Chris@82 869 T2k = T2i - T2j;
Chris@82 870 T2T = T2j + T2i;
Chris@82 871 T2o = T2k - T2n;
Chris@82 872 T36 = T2T - T2S;
Chris@82 873 T2A = T2n + T2k;
Chris@82 874 T2U = T2S + T2T;
Chris@82 875 }
Chris@82 876 {
Chris@82 877 E T1M, T2H, T1J, T2I, T1I;
Chris@82 878 T1M = FMA(KP951056516, T1K, KP587785252 * T1L);
Chris@82 879 T2H = FNMS(KP951056516, T1L, KP587785252 * T1K);
Chris@82 880 T1I = FNMS(KP250000000, T1H, T1G);
Chris@82 881 T1J = T1B + T1I;
Chris@82 882 T2I = T1I - T1B;
Chris@82 883 T1N = T1J - T1M;
Chris@82 884 T2Z = T2I - T2H;
Chris@82 885 T2t = T1M + T1J;
Chris@82 886 T2J = T2H + T2I;
Chris@82 887 }
Chris@82 888 {
Chris@82 889 E T1w, T2E, T19, T2D, T18;
Chris@82 890 T1w = FMA(KP951056516, T1k, KP587785252 * T1v);
Chris@82 891 T2E = FNMS(KP951056516, T1v, KP587785252 * T1k);
Chris@82 892 T18 = FNMS(KP250000000, T17, T16);
Chris@82 893 T19 = T11 + T18;
Chris@82 894 T2D = T18 - T11;
Chris@82 895 T1x = T19 + T1w;
Chris@82 896 T2X = T2D + T2E;
Chris@82 897 T2r = T19 - T1w;
Chris@82 898 T2F = T2D - T2E;
Chris@82 899 }
Chris@82 900 {
Chris@82 901 E T2f, T2P, T2c, T2O, T2a;
Chris@82 902 T2f = FNMS(KP951056516, T2e, KP587785252 * T2d);
Chris@82 903 T2P = FMA(KP951056516, T2d, KP587785252 * T2e);
Chris@82 904 T2a = FNMS(KP250000000, TC, T7);
Chris@82 905 T2c = T2a - T2b;
Chris@82 906 T2O = T2b + T2a;
Chris@82 907 T2g = T2c + T2f;
Chris@82 908 T34 = T2O + T2P;
Chris@82 909 T2y = T2c - T2f;
Chris@82 910 T2Q = T2O - T2P;
Chris@82 911 }
Chris@82 912 {
Chris@82 913 E T1O, T28, TE, T1y;
Chris@82 914 TE = W[0];
Chris@82 915 T1y = W[1];
Chris@82 916 T1O = FMA(TE, T1x, T1y * T1N);
Chris@82 917 T28 = FNMS(T1y, T1x, TE * T1N);
Chris@82 918 Rp[0] = TD - T1O;
Chris@82 919 Ip[0] = T27 + T28;
Chris@82 920 Rm[0] = TD + T1O;
Chris@82 921 Im[0] = T28 - T27;
Chris@82 922 }
Chris@82 923 {
Chris@82 924 E T37, T3d, T33, T35;
Chris@82 925 T33 = W[6];
Chris@82 926 T35 = W[7];
Chris@82 927 T37 = FNMS(T35, T36, T33 * T34);
Chris@82 928 T3d = FMA(T35, T34, T33 * T36);
Chris@82 929 Rp[WS(rs, 2)] = T37 - T3c;
Chris@82 930 Ip[WS(rs, 2)] = T3d + T3e;
Chris@82 931 Rm[WS(rs, 2)] = T37 + T3c;
Chris@82 932 Im[WS(rs, 2)] = T3e - T3d;
Chris@82 933 }
Chris@82 934 {
Chris@82 935 E T2p, T2v, T2u, T2w;
Chris@82 936 {
Chris@82 937 E T29, T2h, T2q, T2s;
Chris@82 938 T29 = W[14];
Chris@82 939 T2h = W[15];
Chris@82 940 T2p = FNMS(T2h, T2o, T29 * T2g);
Chris@82 941 T2v = FMA(T2h, T2g, T29 * T2o);
Chris@82 942 T2q = W[16];
Chris@82 943 T2s = W[17];
Chris@82 944 T2u = FMA(T2q, T2r, T2s * T2t);
Chris@82 945 T2w = FNMS(T2s, T2r, T2q * T2t);
Chris@82 946 }
Chris@82 947 Rp[WS(rs, 4)] = T2p - T2u;
Chris@82 948 Ip[WS(rs, 4)] = T2v + T2w;
Chris@82 949 Rm[WS(rs, 4)] = T2p + T2u;
Chris@82 950 Im[WS(rs, 4)] = T2w - T2v;
Chris@82 951 }
Chris@82 952 {
Chris@82 953 E T2B, T2L, T2K, T2M;
Chris@82 954 {
Chris@82 955 E T2x, T2z, T2C, T2G;
Chris@82 956 T2x = W[22];
Chris@82 957 T2z = W[23];
Chris@82 958 T2B = FNMS(T2z, T2A, T2x * T2y);
Chris@82 959 T2L = FMA(T2z, T2y, T2x * T2A);
Chris@82 960 T2C = W[24];
Chris@82 961 T2G = W[25];
Chris@82 962 T2K = FMA(T2C, T2F, T2G * T2J);
Chris@82 963 T2M = FNMS(T2G, T2F, T2C * T2J);
Chris@82 964 }
Chris@82 965 Rp[WS(rs, 6)] = T2B - T2K;
Chris@82 966 Ip[WS(rs, 6)] = T2L + T2M;
Chris@82 967 Rm[WS(rs, 6)] = T2B + T2K;
Chris@82 968 Im[WS(rs, 6)] = T2M - T2L;
Chris@82 969 }
Chris@82 970 {
Chris@82 971 E T2V, T31, T30, T32;
Chris@82 972 {
Chris@82 973 E T2N, T2R, T2W, T2Y;
Chris@82 974 T2N = W[30];
Chris@82 975 T2R = W[31];
Chris@82 976 T2V = FNMS(T2R, T2U, T2N * T2Q);
Chris@82 977 T31 = FMA(T2R, T2Q, T2N * T2U);
Chris@82 978 T2W = W[32];
Chris@82 979 T2Y = W[33];
Chris@82 980 T30 = FMA(T2W, T2X, T2Y * T2Z);
Chris@82 981 T32 = FNMS(T2Y, T2X, T2W * T2Z);
Chris@82 982 }
Chris@82 983 Rp[WS(rs, 8)] = T2V - T30;
Chris@82 984 Ip[WS(rs, 8)] = T31 + T32;
Chris@82 985 Rm[WS(rs, 8)] = T2V + T30;
Chris@82 986 Im[WS(rs, 8)] = T32 - T31;
Chris@82 987 }
Chris@82 988 }
Chris@82 989 {
Chris@82 990 E T4F, T4P, T5c, T5e, T3y, T54, T4o, T4S, T4h, T4Z, T4x, T4N, T45, T4X, T4v;
Chris@82 991 E T4J, T3K, T56, T4s, T4U;
Chris@82 992 {
Chris@82 993 E T4C, T4E, T4B, T4D;
Chris@82 994 T4C = T3g + T3n;
Chris@82 995 T4E = T3G + T3D;
Chris@82 996 T4B = W[18];
Chris@82 997 T4D = W[19];
Chris@82 998 T4F = FNMS(T4D, T4E, T4B * T4C);
Chris@82 999 T4P = FMA(T4D, T4C, T4B * T4E);
Chris@82 1000 }
Chris@82 1001 {
Chris@82 1002 E T59, T5b, T58, T5a;
Chris@82 1003 T59 = T3N + T3U;
Chris@82 1004 T5b = T4d + T4a;
Chris@82 1005 T58 = W[28];
Chris@82 1006 T5a = W[29];
Chris@82 1007 T5c = FMA(T58, T59, T5a * T5b);
Chris@82 1008 T5e = FNMS(T5a, T59, T58 * T5b);
Chris@82 1009 }
Chris@82 1010 {
Chris@82 1011 E T3x, T4n, T3q, T4m, T3o;
Chris@82 1012 T3x = FNMS(KP951056516, T3w, KP587785252 * T3t);
Chris@82 1013 T4n = FMA(KP951056516, T3t, KP587785252 * T3w);
Chris@82 1014 T3o = FNMS(KP250000000, T3n, T3g);
Chris@82 1015 T3q = T3o - T3p;
Chris@82 1016 T4m = T3p + T3o;
Chris@82 1017 T3y = T3q - T3x;
Chris@82 1018 T54 = T4m + T4n;
Chris@82 1019 T4o = T4m - T4n;
Chris@82 1020 T4S = T3q + T3x;
Chris@82 1021 }
Chris@82 1022 {
Chris@82 1023 E T49, T4M, T4g, T4L, T4e;
Chris@82 1024 T49 = FNMS(KP951056516, T48, KP587785252 * T47);
Chris@82 1025 T4M = FMA(KP951056516, T47, KP587785252 * T48);
Chris@82 1026 T4e = FNMS(KP250000000, T4d, T4a);
Chris@82 1027 T4g = T4e - T4f;
Chris@82 1028 T4L = T4f + T4e;
Chris@82 1029 T4h = T49 + T4g;
Chris@82 1030 T4Z = T4M + T4L;
Chris@82 1031 T4x = T4g - T49;
Chris@82 1032 T4N = T4L - T4M;
Chris@82 1033 }
Chris@82 1034 {
Chris@82 1035 E T44, T4I, T3X, T4H, T3V;
Chris@82 1036 T44 = FNMS(KP951056516, T43, KP587785252 * T40);
Chris@82 1037 T4I = FMA(KP951056516, T40, KP587785252 * T43);
Chris@82 1038 T3V = FNMS(KP250000000, T3U, T3N);
Chris@82 1039 T3X = T3V - T3W;
Chris@82 1040 T4H = T3W + T3V;
Chris@82 1041 T45 = T3X - T44;
Chris@82 1042 T4X = T4H - T4I;
Chris@82 1043 T4v = T3X + T44;
Chris@82 1044 T4J = T4H + T4I;
Chris@82 1045 }
Chris@82 1046 {
Chris@82 1047 E T3C, T4q, T3J, T4r, T3H;
Chris@82 1048 T3C = FNMS(KP951056516, T3B, KP587785252 * T3A);
Chris@82 1049 T4q = FMA(KP951056516, T3A, KP587785252 * T3B);
Chris@82 1050 T3H = FNMS(KP250000000, T3G, T3D);
Chris@82 1051 T3J = T3H - T3I;
Chris@82 1052 T4r = T3I + T3H;
Chris@82 1053 T3K = T3C + T3J;
Chris@82 1054 T56 = T4r - T4q;
Chris@82 1055 T4s = T4q + T4r;
Chris@82 1056 T4U = T3J - T3C;
Chris@82 1057 }
Chris@82 1058 {
Chris@82 1059 E T4O, T4Q, T4G, T4K;
Chris@82 1060 T4G = W[20];
Chris@82 1061 T4K = W[21];
Chris@82 1062 T4O = FMA(T4G, T4J, T4K * T4N);
Chris@82 1063 T4Q = FNMS(T4K, T4J, T4G * T4N);
Chris@82 1064 Rp[WS(rs, 5)] = T4F - T4O;
Chris@82 1065 Ip[WS(rs, 5)] = T4P + T4Q;
Chris@82 1066 Rm[WS(rs, 5)] = T4F + T4O;
Chris@82 1067 Im[WS(rs, 5)] = T4Q - T4P;
Chris@82 1068 }
Chris@82 1069 {
Chris@82 1070 E T57, T5d, T53, T55;
Chris@82 1071 T53 = W[26];
Chris@82 1072 T55 = W[27];
Chris@82 1073 T57 = FNMS(T55, T56, T53 * T54);
Chris@82 1074 T5d = FMA(T55, T54, T53 * T56);
Chris@82 1075 Rp[WS(rs, 7)] = T57 - T5c;
Chris@82 1076 Ip[WS(rs, 7)] = T5d + T5e;
Chris@82 1077 Rm[WS(rs, 7)] = T57 + T5c;
Chris@82 1078 Im[WS(rs, 7)] = T5e - T5d;
Chris@82 1079 }
Chris@82 1080 {
Chris@82 1081 E T3L, T4j, T4i, T4k;
Chris@82 1082 {
Chris@82 1083 E T3f, T3z, T3M, T46;
Chris@82 1084 T3f = W[2];
Chris@82 1085 T3z = W[3];
Chris@82 1086 T3L = FNMS(T3z, T3K, T3f * T3y);
Chris@82 1087 T4j = FMA(T3z, T3y, T3f * T3K);
Chris@82 1088 T3M = W[4];
Chris@82 1089 T46 = W[5];
Chris@82 1090 T4i = FMA(T3M, T45, T46 * T4h);
Chris@82 1091 T4k = FNMS(T46, T45, T3M * T4h);
Chris@82 1092 }
Chris@82 1093 Rp[WS(rs, 1)] = T3L - T4i;
Chris@82 1094 Ip[WS(rs, 1)] = T4j + T4k;
Chris@82 1095 Rm[WS(rs, 1)] = T3L + T4i;
Chris@82 1096 Im[WS(rs, 1)] = T4k - T4j;
Chris@82 1097 }
Chris@82 1098 {
Chris@82 1099 E T4t, T4z, T4y, T4A;
Chris@82 1100 {
Chris@82 1101 E T4l, T4p, T4u, T4w;
Chris@82 1102 T4l = W[10];
Chris@82 1103 T4p = W[11];
Chris@82 1104 T4t = FNMS(T4p, T4s, T4l * T4o);
Chris@82 1105 T4z = FMA(T4p, T4o, T4l * T4s);
Chris@82 1106 T4u = W[12];
Chris@82 1107 T4w = W[13];
Chris@82 1108 T4y = FMA(T4u, T4v, T4w * T4x);
Chris@82 1109 T4A = FNMS(T4w, T4v, T4u * T4x);
Chris@82 1110 }
Chris@82 1111 Rp[WS(rs, 3)] = T4t - T4y;
Chris@82 1112 Ip[WS(rs, 3)] = T4z + T4A;
Chris@82 1113 Rm[WS(rs, 3)] = T4t + T4y;
Chris@82 1114 Im[WS(rs, 3)] = T4A - T4z;
Chris@82 1115 }
Chris@82 1116 {
Chris@82 1117 E T4V, T51, T50, T52;
Chris@82 1118 {
Chris@82 1119 E T4R, T4T, T4W, T4Y;
Chris@82 1120 T4R = W[34];
Chris@82 1121 T4T = W[35];
Chris@82 1122 T4V = FNMS(T4T, T4U, T4R * T4S);
Chris@82 1123 T51 = FMA(T4T, T4S, T4R * T4U);
Chris@82 1124 T4W = W[36];
Chris@82 1125 T4Y = W[37];
Chris@82 1126 T50 = FMA(T4W, T4X, T4Y * T4Z);
Chris@82 1127 T52 = FNMS(T4Y, T4X, T4W * T4Z);
Chris@82 1128 }
Chris@82 1129 Rp[WS(rs, 9)] = T4V - T50;
Chris@82 1130 Ip[WS(rs, 9)] = T51 + T52;
Chris@82 1131 Rm[WS(rs, 9)] = T4V + T50;
Chris@82 1132 Im[WS(rs, 9)] = T52 - T51;
Chris@82 1133 }
Chris@82 1134 }
Chris@82 1135 }
Chris@82 1136 }
Chris@82 1137 }
Chris@82 1138
/*
 * Twiddle-factor instruction list for this codelet; `desc` below refers to it.
 * The table holds two entries: a TW_FULL entry with the values (1, 20) and a
 * TW_NEXT entry with the values (1, 0).
 * NOTE(review): TW_FULL with 20 presumably requests the complete twiddle set
 * for a size-20 transform, and TW_NEXT presumably ends the list -- confirm
 * against the tw_instr definition in the FFTW kernel headers.
 */
Chris@82 1139 static const tw_instr twinstr[] = {
Chris@82 1140 {TW_FULL, 1, 20},
Chris@82 1141 {TW_NEXT, 1, 0}
Chris@82 1142 };
Chris@82 1143
/*
 * Descriptor handed to the planner when this codelet is registered.
 * It is initialised with 20, the name string "hc2cbdft2_20", the `twinstr`
 * table, the address of GENUS, and the brace group {224, 62, 62, 0}.
 * NOTE(review): 20 is presumably the radix, and the final brace group looks
 * like operation counts (adds, muls, fused multiply-adds, other) -- confirm
 * against the hc2c_desc declaration.
 */
Chris@82 1144 static const hc2c_desc desc = { 20, "hc2cbdft2_20", twinstr, &GENUS, {224, 62, 62, 0} };
Chris@82 1145
/*
 * Registration entry point for this codelet.
 * Calls X(khc2c_register) with the planner `p`, the hc2cbdft2_20 function,
 * the address of `desc`, and the HC2C_VIA_DFT constant.
 * The name is wrapped in X(), so the exported symbol depends on how that
 * macro expands in the build.
 */
Chris@82 1146 void X(codelet_hc2cbdft2_20) (planner *p) {
Chris@82 1147 X(khc2c_register) (p, hc2cbdft2_20, &desc, HC2C_VIA_DFT);
Chris@82 1148 }
Chris@82 1149 #endif