annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cb_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:51:35 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cb_20 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 246 FP additions, 148 FP multiplications,
Chris@42 32 * (or, 136 additions, 38 multiplications, 110 fused multiply/add),
Chris@42 33 * 112 stack variables, 4 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cb_20 -- machine-generated size-20 codelet (FMA-scheduled variant,
 * compiled when HAVE_FMA is defined).
 *
 * For every m in [mb, me) the loop body:
 *   - loads 10 values from each of Rp, Ip, Rm and Im (indices 0 and
 *     WS(rs, 1..9)); all 40 loads happen before the first store;
 *   - combines them through sums/differences and FMA/FNMS steps using the
 *     four DK constants below;
 *   - writes 40 results back in place to the same four arrays.
 * Rp[0] and Rm[0] are stored as plain sums; each of the other 19 output
 * pairs (Rp/Rm[k] for k = 1..9, Ip/Im[k] for k = 0..9) is built from two
 * intermediates and one consecutive twiddle pair W[2j], W[2j+1]
 * (W[0]..W[37] are all read once per iteration).
 *
 * Parameters (as used in this body):
 *   Rp, Ip  - arrays read and written; advanced by +ms each iteration.
 *   Rm, Im  - arrays read and written; advanced by -ms each iteration.
 *   W       - twiddle table; offset by (mb - 1) * 38 before the first
 *             iteration and advanced by 38 per iteration.
 *   rs      - stride handed to WS() to form element indices.
 *   mb, me  - half-open iteration range for m.
 *   ms      - per-iteration pointer step for the four data arrays.
 *
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) come from the included FFTW
 * headers; presumably a*b + c and c - a*b respectively -- confirm there.
 */
Chris@42 37 static void hc2cb_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/*
 * Numeric constants: 0.951056516... equals sin(2*pi/5), 0.559016994...
 * equals sqrt(5)/4, 0.618033988... equals (sqrt(5) - 1)/2.
 */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 42 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 43 {
Chris@42 44 INT m;
/* Rp/Ip walk forwards while Rm/Im walk backwards; W moves 38 entries per m. */
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
/* Declared at loop scope because the final store (after all nested scopes close) needs them. */
Chris@42 46 E T1T, T1Q, T1P;
Chris@42 47 {
Chris@42 48 E T3z, T4z, TE, T7, T2W, T4e, T2l, T1t, T33, T3H, T3G, T3a, T1i, T2g, T13;
Chris@42 49 E T4H, T4G, T2d, T1B, T4u, T4B, T4A, T4r, T1A, T2s, T3l, T2t, T3s, T2o, T2q;
Chris@42 50 E T1w, T1y, TC, T29, T3E, T3C, T4n, T4l, TN, TL;
Chris@42 51 {
Chris@42 52 E T4, T2U, T3, T3x, T1p, T5, T1q, T1r;
Chris@42 53 {
Chris@42 54 E T1, T2, T1n, T1o;
Chris@42 55 T1 = Rp[0];
Chris@42 56 T2 = Rm[WS(rs, 9)];
Chris@42 57 T1n = Ip[0];
Chris@42 58 T1o = Im[WS(rs, 9)];
Chris@42 59 T4 = Rp[WS(rs, 5)];
Chris@42 60 T2U = T1 - T2;
Chris@42 61 T3 = T1 + T2;
Chris@42 62 T3x = T1n + T1o;
Chris@42 63 T1p = T1n - T1o;
Chris@42 64 T5 = Rm[WS(rs, 4)];
Chris@42 65 T1q = Ip[WS(rs, 5)];
Chris@42 66 T1r = Im[WS(rs, 4)];
Chris@42 67 }
Chris@42 68 {
Chris@42 69 E T3o, T4p, TF, Te, T2Z, T4f, T2b, T1a, T3k, T4t, TJ, TA, T39, T4j, T2f;
Chris@42 70 E T12, T3r, T4q, TG, Tl, T32, T4g, T2c, T1h, Tq, T34, Tp, T3f, TR, Tr;
Chris@42 71 E TS, TT;
Chris@42 72 {
Chris@42 73 E Tx, T37, Tw, T3j, TY, Ty, TZ, T10;
Chris@42 74 {
Chris@42 75 E Tb, T2X, Ta, T3m, T16, Tc, T17, T18;
Chris@42 76 {
Chris@42 77 E T8, T9, T14, T15;
Chris@42 78 T8 = Rp[WS(rs, 4)];
Chris@42 79 {
Chris@42 80 E T3y, T6, T2V, T1s;
Chris@42 81 T3y = T4 - T5;
Chris@42 82 T6 = T4 + T5;
Chris@42 83 T2V = T1q + T1r;
Chris@42 84 T1s = T1q - T1r;
Chris@42 85 T3z = T3x - T3y;
Chris@42 86 T4z = T3y + T3x;
Chris@42 87 TE = T3 - T6;
Chris@42 88 T7 = T3 + T6;
Chris@42 89 T2W = T2U + T2V;
Chris@42 90 T4e = T2U - T2V;
Chris@42 91 T2l = T1p + T1s;
Chris@42 92 T1t = T1p - T1s;
Chris@42 93 T9 = Rm[WS(rs, 5)];
Chris@42 94 }
Chris@42 95 T14 = Ip[WS(rs, 4)];
Chris@42 96 T15 = Im[WS(rs, 5)];
Chris@42 97 Tb = Rp[WS(rs, 9)];
Chris@42 98 T2X = T8 - T9;
Chris@42 99 Ta = T8 + T9;
Chris@42 100 T3m = T14 + T15;
Chris@42 101 T16 = T14 - T15;
Chris@42 102 Tc = Rm[0];
Chris@42 103 T17 = Ip[WS(rs, 9)];
Chris@42 104 T18 = Im[0];
Chris@42 105 }
Chris@42 106 {
Chris@42 107 E Tu, Tv, TW, TX;
Chris@42 108 Tu = Rm[WS(rs, 7)];
Chris@42 109 {
Chris@42 110 E T3n, Td, T2Y, T19;
Chris@42 111 T3n = Tb - Tc;
Chris@42 112 Td = Tb + Tc;
Chris@42 113 T2Y = T17 + T18;
Chris@42 114 T19 = T17 - T18;
Chris@42 115 T3o = T3m - T3n;
Chris@42 116 T4p = T3n + T3m;
Chris@42 117 TF = Ta - Td;
Chris@42 118 Te = Ta + Td;
Chris@42 119 T2Z = T2X + T2Y;
Chris@42 120 T4f = T2X - T2Y;
Chris@42 121 T2b = T16 + T19;
Chris@42 122 T1a = T16 - T19;
Chris@42 123 Tv = Rp[WS(rs, 2)];
Chris@42 124 }
Chris@42 125 TW = Ip[WS(rs, 2)];
Chris@42 126 TX = Im[WS(rs, 7)];
Chris@42 127 Tx = Rm[WS(rs, 2)];
Chris@42 128 T37 = Tu - Tv;
Chris@42 129 Tw = Tu + Tv;
Chris@42 130 T3j = TW + TX;
Chris@42 131 TY = TW - TX;
Chris@42 132 Ty = Rp[WS(rs, 7)];
Chris@42 133 TZ = Ip[WS(rs, 7)];
Chris@42 134 T10 = Im[WS(rs, 2)];
Chris@42 135 }
Chris@42 136 }
Chris@42 137 {
Chris@42 138 E Ti, T30, Th, T3q, T1d, Tj, T1e, T1f;
Chris@42 139 {
Chris@42 140 E Tf, Tg, T1b, T1c;
Chris@42 141 Tf = Rm[WS(rs, 3)];
Chris@42 142 {
Chris@42 143 E T3i, Tz, T38, T11;
Chris@42 144 T3i = Tx - Ty;
Chris@42 145 Tz = Tx + Ty;
Chris@42 146 T38 = TZ + T10;
Chris@42 147 T11 = TZ - T10;
Chris@42 148 T3k = T3i + T3j;
Chris@42 149 T4t = T3i - T3j;
Chris@42 150 TJ = Tw - Tz;
Chris@42 151 TA = Tw + Tz;
Chris@42 152 T39 = T37 - T38;
Chris@42 153 T4j = T37 + T38;
Chris@42 154 T2f = TY + T11;
Chris@42 155 T12 = TY - T11;
Chris@42 156 Tg = Rp[WS(rs, 6)];
Chris@42 157 }
Chris@42 158 T1b = Ip[WS(rs, 6)];
Chris@42 159 T1c = Im[WS(rs, 3)];
Chris@42 160 Ti = Rp[WS(rs, 1)];
Chris@42 161 T30 = Tf - Tg;
Chris@42 162 Th = Tf + Tg;
Chris@42 163 T3q = T1b + T1c;
Chris@42 164 T1d = T1b - T1c;
Chris@42 165 Tj = Rm[WS(rs, 8)];
Chris@42 166 T1e = Ip[WS(rs, 1)];
Chris@42 167 T1f = Im[WS(rs, 8)];
Chris@42 168 }
Chris@42 169 {
Chris@42 170 E Tn, To, TP, TQ;
Chris@42 171 Tn = Rp[WS(rs, 8)];
Chris@42 172 {
Chris@42 173 E T3p, Tk, T31, T1g;
Chris@42 174 T3p = Ti - Tj;
Chris@42 175 Tk = Ti + Tj;
Chris@42 176 T31 = T1e + T1f;
Chris@42 177 T1g = T1e - T1f;
Chris@42 178 T3r = T3p + T3q;
Chris@42 179 T4q = T3p - T3q;
Chris@42 180 TG = Th - Tk;
Chris@42 181 Tl = Th + Tk;
Chris@42 182 T32 = T30 + T31;
Chris@42 183 T4g = T30 - T31;
Chris@42 184 T2c = T1d + T1g;
Chris@42 185 T1h = T1d - T1g;
Chris@42 186 To = Rm[WS(rs, 1)];
Chris@42 187 }
Chris@42 188 TP = Ip[WS(rs, 8)];
Chris@42 189 TQ = Im[WS(rs, 1)];
Chris@42 190 Tq = Rm[WS(rs, 6)];
Chris@42 191 T34 = Tn - To;
Chris@42 192 Tp = Tn + To;
Chris@42 193 T3f = TP + TQ;
Chris@42 194 TR = TP - TQ;
Chris@42 195 Tr = Rp[WS(rs, 3)];
Chris@42 196 TS = Ip[WS(rs, 3)];
Chris@42 197 TT = Im[WS(rs, 6)];
Chris@42 198 }
Chris@42 199 }
Chris@42 200 }
Chris@42 201 {
Chris@42 202 E T3h, Tt, T1u, T2n, T1v, T4k, T4h, T2m, TH, TK, T4s, TI;
Chris@42 203 T33 = T2Z + T32;
Chris@42 204 T3H = T2Z - T32;
Chris@42 205 {
Chris@42 206 E T3g, Ts, T35, TU;
Chris@42 207 T3g = Tq - Tr;
Chris@42 208 Ts = Tq + Tr;
Chris@42 209 T35 = TS + TT;
Chris@42 210 TU = TS - TT;
Chris@42 211 T3h = T3f - T3g;
Chris@42 212 T4s = T3g + T3f;
Chris@42 213 TI = Tp - Ts;
Chris@42 214 Tt = Tp + Ts;
Chris@42 215 {
Chris@42 216 E T36, T4i, T2e, TV;
Chris@42 217 T36 = T34 - T35;
Chris@42 218 T4i = T34 + T35;
Chris@42 219 T2e = TR + TU;
Chris@42 220 TV = TR - TU;
Chris@42 221 T3G = T36 - T39;
Chris@42 222 T3a = T36 + T39;
Chris@42 223 T1u = T1a + T1h;
Chris@42 224 T1i = T1a - T1h;
Chris@42 225 T2g = T2e - T2f;
Chris@42 226 T2n = T2e + T2f;
Chris@42 227 T1v = TV + T12;
Chris@42 228 T13 = TV - T12;
Chris@42 229 T4H = T4i - T4j;
Chris@42 230 T4k = T4i + T4j;
Chris@42 231 }
Chris@42 232 }
Chris@42 233 T4h = T4f + T4g;
Chris@42 234 T4G = T4f - T4g;
Chris@42 235 T2d = T2b - T2c;
Chris@42 236 T2m = T2b + T2c;
Chris@42 237 TH = TF + TG;
Chris@42 238 T1B = TF - TG;
Chris@42 239 T4u = T4s - T4t;
Chris@42 240 T4B = T4s + T4t;
Chris@42 241 T4A = T4p + T4q;
Chris@42 242 T4r = T4p - T4q;
Chris@42 243 T1A = TI - TJ;
Chris@42 244 TK = TI + TJ;
Chris@42 245 {
Chris@42 246 E Tm, T3B, TB, T3A;
Chris@42 247 Tm = Te + Tl;
Chris@42 248 T2s = Te - Tl;
Chris@42 249 T3l = T3h + T3k;
Chris@42 250 T3B = T3h - T3k;
Chris@42 251 TB = Tt + TA;
Chris@42 252 T2t = Tt - TA;
Chris@42 253 T3s = T3o + T3r;
Chris@42 254 T3A = T3o - T3r;
Chris@42 255 T2o = T2m + T2n;
Chris@42 256 T2q = T2m - T2n;
Chris@42 257 T1w = T1u + T1v;
Chris@42 258 T1y = T1u - T1v;
Chris@42 259 TC = Tm + TB;
Chris@42 260 T29 = Tm - TB;
Chris@42 261 T3E = T3A - T3B;
Chris@42 262 T3C = T3A + T3B;
Chris@42 263 T4n = T4h - T4k;
Chris@42 264 T4l = T4h + T4k;
Chris@42 265 TN = TH - TK;
Chris@42 266 TL = TH + TK;
Chris@42 267 }
Chris@42 268 }
Chris@42 269 }
Chris@42 270 }
Chris@42 271 {
Chris@42 272 E T3d, T3b, T4E, T1x, TM, T4m, T58, T5b, T4D, T5a, T5c, T59, T4C;
/* First stores: Rp[0] (here) and Rm[0] (below) are plain sums, with no W factor applied. */
Chris@42 273 Rp[0] = T7 + TC;
Chris@42 274 T3d = T33 - T3a;
Chris@42 275 T3b = T33 + T3a;
Chris@42 276 T4E = T4A - T4B;
Chris@42 277 T4C = T4A + T4B;
Chris@42 278 Rm[0] = T2l + T2o;
Chris@42 279 {
Chris@42 280 E T25, T22, T21, T24, T23, T26, T57;
Chris@42 281 T1x = FNMS(KP250000000, T1w, T1t);
Chris@42 282 T25 = T1t + T1w;
Chris@42 283 T22 = TE + TL;
Chris@42 284 TM = FNMS(KP250000000, TL, TE);
Chris@42 285 T21 = W[18];
Chris@42 286 T24 = W[19];
Chris@42 287 T4m = FNMS(KP250000000, T4l, T4e);
Chris@42 288 T58 = T4e + T4l;
Chris@42 289 T5b = T4z + T4C;
Chris@42 290 T4D = FNMS(KP250000000, T4C, T4z);
Chris@42 291 T23 = T21 * T22;
Chris@42 292 T26 = T24 * T22;
Chris@42 293 T57 = W[8];
Chris@42 294 T5a = W[9];
Chris@42 295 Rp[WS(rs, 5)] = FNMS(T24, T25, T23);
Chris@42 296 Rm[WS(rs, 5)] = FMA(T21, T25, T26);
Chris@42 297 T5c = T57 * T5b;
Chris@42 298 T59 = T57 * T58;
Chris@42 299 }
Chris@42 300 {
Chris@42 301 E T3U, T3Z, T3W, T40, T3V;
Chris@42 302 {
Chris@42 303 E T3c, T48, T4b, T3D, T47, T4a;
Chris@42 304 T3c = FNMS(KP250000000, T3b, T2W);
Chris@42 305 T48 = T2W + T3b;
Chris@42 306 T4b = T3z + T3C;
Chris@42 307 T3D = FNMS(KP250000000, T3C, T3z);
Chris@42 308 Im[WS(rs, 2)] = FMA(T5a, T58, T5c);
Chris@42 309 Ip[WS(rs, 2)] = FNMS(T5a, T5b, T59);
Chris@42 310 T47 = W[28];
Chris@42 311 T4a = W[29];
Chris@42 312 {
Chris@42 313 E T3I, T3Y, T42, T3u, T3M, T3X, T3F;
Chris@42 314 {
Chris@42 315 E T3T, T3t, T4c, T49, T3e, T3S;
Chris@42 316 T3T = FMA(KP618033988, T3l, T3s);
Chris@42 317 T3t = FNMS(KP618033988, T3s, T3l);
Chris@42 318 T4c = T47 * T4b;
Chris@42 319 T49 = T47 * T48;
Chris@42 320 T3I = FNMS(KP618033988, T3H, T3G);
Chris@42 321 T3Y = FMA(KP618033988, T3G, T3H);
Chris@42 322 Im[WS(rs, 7)] = FMA(T4a, T48, T4c);
Chris@42 323 Ip[WS(rs, 7)] = FNMS(T4a, T4b, T49);
Chris@42 324 T3e = FNMS(KP559016994, T3d, T3c);
Chris@42 325 T3S = FMA(KP559016994, T3d, T3c);
Chris@42 326 T42 = FMA(KP951056516, T3T, T3S);
Chris@42 327 T3U = FNMS(KP951056516, T3T, T3S);
Chris@42 328 T3u = FNMS(KP951056516, T3t, T3e);
Chris@42 329 T3M = FMA(KP951056516, T3t, T3e);
Chris@42 330 T3X = FMA(KP559016994, T3E, T3D);
Chris@42 331 T3F = FNMS(KP559016994, T3E, T3D);
Chris@42 332 }
Chris@42 333 {
Chris@42 334 E T3P, T45, T44, T46, T43;
Chris@42 335 {
Chris@42 336 E T3w, T3J, T3v, T3K, T2T, T41;
Chris@42 337 T2T = W[4];
Chris@42 338 T3w = W[5];
Chris@42 339 T3J = FMA(KP951056516, T3I, T3F);
Chris@42 340 T3P = FNMS(KP951056516, T3I, T3F);
Chris@42 341 T45 = FNMS(KP951056516, T3Y, T3X);
Chris@42 342 T3Z = FMA(KP951056516, T3Y, T3X);
Chris@42 343 T3v = T2T * T3u;
Chris@42 344 T3K = T2T * T3J;
Chris@42 345 T41 = W[36];
Chris@42 346 T44 = W[37];
Chris@42 347 Ip[WS(rs, 1)] = FNMS(T3w, T3J, T3v);
Chris@42 348 Im[WS(rs, 1)] = FMA(T3w, T3u, T3K);
Chris@42 349 T46 = T41 * T45;
Chris@42 350 T43 = T41 * T42;
Chris@42 351 }
Chris@42 352 {
Chris@42 353 E T3O, T3Q, T3N, T3L, T3R;
Chris@42 354 T3L = W[12];
Chris@42 355 T3O = W[13];
Chris@42 356 Im[WS(rs, 9)] = FMA(T44, T42, T46);
Chris@42 357 Ip[WS(rs, 9)] = FNMS(T44, T45, T43);
Chris@42 358 T3Q = T3L * T3P;
Chris@42 359 T3N = T3L * T3M;
Chris@42 360 T3R = W[20];
Chris@42 361 T3W = W[21];
Chris@42 362 Im[WS(rs, 3)] = FMA(T3O, T3M, T3Q);
Chris@42 363 Ip[WS(rs, 3)] = FNMS(T3O, T3P, T3N);
Chris@42 364 T40 = T3R * T3Z;
Chris@42 365 T3V = T3R * T3U;
Chris@42 366 }
Chris@42 367 }
Chris@42 368 }
Chris@42 369 }
Chris@42 370 {
Chris@42 371 E T4U, T4Z, T4W, T50, T4V, T2L, T2I, T2H;
Chris@42 372 {
Chris@42 373 E T4T, T4v, T4I, T4Y, T4o, T4S;
Chris@42 374 T4T = FNMS(KP618033988, T4r, T4u);
Chris@42 375 T4v = FMA(KP618033988, T4u, T4r);
Chris@42 376 Im[WS(rs, 5)] = FMA(T3W, T3U, T40);
Chris@42 377 Ip[WS(rs, 5)] = FNMS(T3W, T3Z, T3V);
Chris@42 378 T4I = FMA(KP618033988, T4H, T4G);
Chris@42 379 T4Y = FNMS(KP618033988, T4G, T4H);
Chris@42 380 T4o = FMA(KP559016994, T4n, T4m);
Chris@42 381 T4S = FNMS(KP559016994, T4n, T4m);
Chris@42 382 {
Chris@42 383 E T52, T4M, T55, T4P, T54, T56, T53;
Chris@42 384 {
Chris@42 385 E T4d, T4w, T4J, T4x, T4y, T4X, T4F, T51, T4K;
Chris@42 386 T4d = W[0];
Chris@42 387 T4X = FNMS(KP559016994, T4E, T4D);
Chris@42 388 T4F = FMA(KP559016994, T4E, T4D);
Chris@42 389 T4U = FNMS(KP951056516, T4T, T4S);
Chris@42 390 T52 = FMA(KP951056516, T4T, T4S);
Chris@42 391 T4M = FMA(KP951056516, T4v, T4o);
Chris@42 392 T4w = FNMS(KP951056516, T4v, T4o);
Chris@42 393 T4Z = FMA(KP951056516, T4Y, T4X);
Chris@42 394 T55 = FNMS(KP951056516, T4Y, T4X);
Chris@42 395 T4P = FNMS(KP951056516, T4I, T4F);
Chris@42 396 T4J = FMA(KP951056516, T4I, T4F);
Chris@42 397 T4x = T4d * T4w;
Chris@42 398 T4y = W[1];
Chris@42 399 T51 = W[32];
Chris@42 400 T4K = T4d * T4J;
Chris@42 401 T54 = W[33];
Chris@42 402 Ip[0] = FNMS(T4y, T4J, T4x);
Chris@42 403 T56 = T51 * T55;
Chris@42 404 T53 = T51 * T52;
Chris@42 405 Im[0] = FMA(T4y, T4w, T4K);
Chris@42 406 }
Chris@42 407 {
Chris@42 408 E T4O, T4Q, T4N, T4L, T4R;
Chris@42 409 T4L = W[16];
Chris@42 410 Im[WS(rs, 8)] = FMA(T54, T52, T56);
Chris@42 411 Ip[WS(rs, 8)] = FNMS(T54, T55, T53);
Chris@42 412 T4O = W[17];
Chris@42 413 T4Q = T4L * T4P;
Chris@42 414 T4N = T4L * T4M;
Chris@42 415 T4R = W[24];
Chris@42 416 T4W = W[25];
Chris@42 417 Im[WS(rs, 4)] = FMA(T4O, T4M, T4Q);
Chris@42 418 Ip[WS(rs, 4)] = FNMS(T4O, T4P, T4N);
Chris@42 419 T50 = T4R * T4Z;
Chris@42 420 T4V = T4R * T4U;
Chris@42 421 }
Chris@42 422 }
Chris@42 423 }
Chris@42 424 {
Chris@42 425 E T2K, T2u, T2F, T2h, T28, T2J, T2r, T2p;
Chris@42 426 T2K = FNMS(KP618033988, T2s, T2t);
Chris@42 427 T2u = FMA(KP618033988, T2t, T2s);
Chris@42 428 Im[WS(rs, 6)] = FMA(T4W, T4U, T50);
Chris@42 429 Ip[WS(rs, 6)] = FNMS(T4W, T4Z, T4V);
Chris@42 430 T2p = FNMS(KP250000000, T2o, T2l);
Chris@42 431 T2F = FNMS(KP618033988, T2d, T2g);
Chris@42 432 T2h = FMA(KP618033988, T2g, T2d);
Chris@42 433 T28 = FNMS(KP250000000, TC, T7);
Chris@42 434 T2J = FNMS(KP559016994, T2q, T2p);
Chris@42 435 T2r = FMA(KP559016994, T2q, T2p);
Chris@42 436 {
Chris@42 437 E T2B, T2G, T2y, T2R, T2Q, T2P, T2A, T2x;
Chris@42 438 {
Chris@42 439 E T2k, T2v, T27, T2O, T2i, T2a, T2E;
Chris@42 440 T2k = W[7];
Chris@42 441 T2a = FMA(KP559016994, T29, T28);
Chris@42 442 T2E = FNMS(KP559016994, T29, T28);
Chris@42 443 T2B = FMA(KP951056516, T2u, T2r);
Chris@42 444 T2v = FNMS(KP951056516, T2u, T2r);
Chris@42 445 T27 = W[6];
Chris@42 446 T2O = FMA(KP951056516, T2F, T2E);
Chris@42 447 T2G = FNMS(KP951056516, T2F, T2E);
Chris@42 448 T2i = FMA(KP951056516, T2h, T2a);
Chris@42 449 T2y = FNMS(KP951056516, T2h, T2a);
Chris@42 450 {
Chris@42 451 E T2N, T2j, T2w, T2S;
Chris@42 452 T2L = FMA(KP951056516, T2K, T2J);
Chris@42 453 T2R = FNMS(KP951056516, T2K, T2J);
Chris@42 454 T2Q = W[23];
Chris@42 455 T2N = W[22];
Chris@42 456 T2j = T27 * T2i;
Chris@42 457 T2w = T2k * T2i;
Chris@42 458 T2S = T2Q * T2O;
Chris@42 459 T2P = T2N * T2O;
Chris@42 460 Rp[WS(rs, 2)] = FNMS(T2k, T2v, T2j);
Chris@42 461 Rm[WS(rs, 2)] = FMA(T27, T2v, T2w);
Chris@42 462 Rm[WS(rs, 6)] = FMA(T2N, T2R, T2S);
Chris@42 463 }
Chris@42 464 }
Chris@42 465 Rp[WS(rs, 6)] = FNMS(T2Q, T2R, T2P);
Chris@42 466 T2A = W[31];
Chris@42 467 T2x = W[30];
Chris@42 468 {
Chris@42 469 E T2D, T2M, T2C, T2z;
Chris@42 470 T2I = W[15];
Chris@42 471 T2C = T2A * T2y;
Chris@42 472 T2z = T2x * T2y;
Chris@42 473 T2D = W[14];
Chris@42 474 T2M = T2I * T2G;
Chris@42 475 Rm[WS(rs, 8)] = FMA(T2x, T2B, T2C);
Chris@42 476 Rp[WS(rs, 8)] = FNMS(T2A, T2B, T2z);
Chris@42 477 T2H = T2D * T2G;
Chris@42 478 Rm[WS(rs, 4)] = FMA(T2D, T2L, T2M);
Chris@42 479 }
Chris@42 480 }
Chris@42 481 }
Chris@42 482 {
Chris@42 483 E T1S, T1C, T1j, T1N, T1z, T1R;
Chris@42 484 T1S = FMA(KP618033988, T1A, T1B);
Chris@42 485 T1C = FNMS(KP618033988, T1B, T1A);
Chris@42 486 Rp[WS(rs, 4)] = FNMS(T2I, T2L, T2H);
Chris@42 487 T1j = FNMS(KP618033988, T1i, T13);
Chris@42 488 T1N = FMA(KP618033988, T13, T1i);
Chris@42 489 T1z = FNMS(KP559016994, T1y, T1x);
Chris@42 490 T1R = FMA(KP559016994, T1y, T1x);
Chris@42 491 {
Chris@42 492 E T1J, T1O, T1G, T1Z, T1Y, T1X, T1I, T1F;
Chris@42 493 {
Chris@42 494 E T1m, T1D, TD, T1W, T1k, T1M, TO;
Chris@42 495 T1m = W[3];
Chris@42 496 T1M = FMA(KP559016994, TN, TM);
Chris@42 497 TO = FNMS(KP559016994, TN, TM);
Chris@42 498 T1D = FNMS(KP951056516, T1C, T1z);
Chris@42 499 T1J = FMA(KP951056516, T1C, T1z);
Chris@42 500 TD = W[2];
Chris@42 501 T1O = FNMS(KP951056516, T1N, T1M);
Chris@42 502 T1W = FMA(KP951056516, T1N, T1M);
Chris@42 503 T1G = FNMS(KP951056516, T1j, TO);
Chris@42 504 T1k = FMA(KP951056516, T1j, TO);
Chris@42 505 {
Chris@42 506 E T1V, T1l, T1E, T20;
Chris@42 507 T1Z = FNMS(KP951056516, T1S, T1R);
Chris@42 508 T1T = FMA(KP951056516, T1S, T1R);
Chris@42 509 T1Y = W[27];
Chris@42 510 T1V = W[26];
Chris@42 511 T1l = TD * T1k;
Chris@42 512 T1E = T1m * T1k;
Chris@42 513 T20 = T1Y * T1W;
Chris@42 514 T1X = T1V * T1W;
Chris@42 515 Rp[WS(rs, 1)] = FNMS(T1m, T1D, T1l);
Chris@42 516 Rm[WS(rs, 1)] = FMA(TD, T1D, T1E);
Chris@42 517 Rm[WS(rs, 7)] = FMA(T1V, T1Z, T20);
Chris@42 518 }
Chris@42 519 }
Chris@42 520 Rp[WS(rs, 7)] = FNMS(T1Y, T1Z, T1X);
Chris@42 521 T1I = W[35];
Chris@42 522 T1F = W[34];
Chris@42 523 {
Chris@42 524 E T1L, T1U, T1K, T1H;
Chris@42 525 T1Q = W[11];
Chris@42 526 T1K = T1I * T1G;
Chris@42 527 T1H = T1F * T1G;
Chris@42 528 T1L = W[10];
Chris@42 529 T1U = T1Q * T1O;
Chris@42 530 Rm[WS(rs, 9)] = FMA(T1F, T1J, T1K);
Chris@42 531 Rp[WS(rs, 9)] = FNMS(T1I, T1J, T1H);
Chris@42 532 T1P = T1L * T1O;
Chris@42 533 Rm[WS(rs, 3)] = FMA(T1L, T1T, T1U);
Chris@42 534 }
Chris@42 535 }
Chris@42 536 }
Chris@42 537 }
Chris@42 538 }
Chris@42 539 }
Chris@42 540 }
/* Last store of the iteration; uses the loop-scope temporaries T1Q, T1T and T1P set inside the nested scopes. */
Chris@42 541 Rp[WS(rs, 3)] = FNMS(T1Q, T1T, T1P);
Chris@42 542 }
Chris@42 543 }
Chris@42 544 }
Chris@42 545
/*
 * Twiddle instruction table referenced by desc below.
 * NOTE(review): presumably requests the full twiddle set for a size-20
 * transform (the codelet consumes 38 W entries per iteration) -- confirm
 * against the tw_instr / TW_FULL definitions in the FFTW headers.
 */
Chris@42 546 static const tw_instr twinstr[] = {
Chris@42 547 {TW_FULL, 1, 20},
Chris@42 548 {TW_NEXT, 1, 0}
Chris@42 549 };
Chris@42 550
/*
 * Descriptor handed to the planner for this codelet: size 20, its name,
 * the twiddle table and GENUS. The trailing {136, 38, 110, 0} matches the
 * add / multiply / fused multiply-add counts quoted in the generated
 * comment for the FMA variant of hc2cb_20.
 */
Chris@42 551 static const hc2c_desc desc = { 20, "hc2cb_20", twinstr, &GENUS, {136, 38, 110, 0} };
Chris@42 552
/*
 * Registration entry point: passes the FMA variant of hc2cb_20 and its
 * descriptor to X(khc2c_register) on planner p, with the HC2C_VIA_RDFT flag.
 */
Chris@42 553 void X(codelet_hc2cb_20) (planner *p) {
Chris@42 554 X(khc2c_register) (p, hc2cb_20, &desc, HC2C_VIA_RDFT);
Chris@42 555 }
Chris@42 556 #else /* HAVE_FMA */
Chris@42 557
Chris@42 558 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -dif -name hc2cb_20 -include hc2cb.h */
Chris@42 559
Chris@42 560 /*
Chris@42 561 * This function contains 246 FP additions, 124 FP multiplications,
Chris@42 562 * (or, 184 additions, 62 multiplications, 62 fused multiply/add),
Chris@42 563 * 97 stack variables, 4 constants, and 80 memory accesses
Chris@42 564 */
Chris@42 565 #include "hc2cb.h"
Chris@42 566
/*
 * hc2cb_20 -- machine-generated size-20 codelet (variant compiled when
 * HAVE_FMA is not defined).
 *
 * For every m in [mb, me) the loop body:
 *   - loads 10 values from each of Rp, Ip, Rm and Im (indices 0 and
 *     WS(rs, 1..9)); all 40 loads happen before the first store;
 *   - forms sums/differences and scales them by the four DK constants;
 *   - writes 40 results back in place to the same four arrays.
 * Rp[0] and Rm[0] are stored as plain sums; each of the other 19 output
 * pairs (Rp/Rm[k] for k = 1..9, Ip/Im[k] for k = 0..9) is built from two
 * intermediates and one consecutive twiddle pair W[2j], W[2j+1]
 * (W[0]..W[37] are all read once per iteration).
 *
 * Parameters (as used in this body):
 *   Rp, Ip  - arrays read and written; advanced by +ms each iteration.
 *   Rm, Im  - arrays read and written; advanced by -ms each iteration.
 *   W       - twiddle table; offset by (mb - 1) * 38 before the first
 *             iteration and advanced by 38 per iteration.
 *   rs      - stride handed to WS() to form element indices.
 *   mb, me  - half-open iteration range for m.
 *   ms      - per-iteration pointer step for the four data arrays.
 *
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) come from the included FFTW
 * headers; presumably a*b + c and c - a*b respectively -- confirm there.
 */
Chris@42 567 static void hc2cb_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 568 {
/*
 * Numeric constants: 0.559016994... equals sqrt(5)/4, 0.587785252...
 * equals sin(pi/5), 0.951056516... equals sin(2*pi/5).
 */
Chris@42 569 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 570 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 571 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 572 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 573 {
Chris@42 574 INT m;
/* Rp/Ip walk forwards while Rm/Im walk backwards; W moves 38 entries per m. */
Chris@42 575 for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@42 576 E T7, T3T, T49, TE, T1v, T2T, T3g, T2d, T13, T3n, T3o, T1i, T26, T4e, T4d;
Chris@42 577 E T23, T1n, T42, T3Z, T1m, T2h, T2I, T2i, T2P, T30, T37, T38, Tm, TB, TC;
Chris@42 578 E T46, T47, T4a, T2a, T2b, T2e, T1w, T1x, T1y, T3O, T3R, T3U, T3h, T3i, T3j;
Chris@42 579 E TH, TK, TL;
Chris@42 580 {
Chris@42 581 E T3, T2R, T1r, T3e, T6, T3f, T1u, T2S;
Chris@42 582 {
Chris@42 583 E T1, T2, T1p, T1q;
Chris@42 584 T1 = Rp[0];
Chris@42 585 T2 = Rm[WS(rs, 9)];
Chris@42 586 T3 = T1 + T2;
Chris@42 587 T2R = T1 - T2;
Chris@42 588 T1p = Ip[0];
Chris@42 589 T1q = Im[WS(rs, 9)];
Chris@42 590 T1r = T1p - T1q;
Chris@42 591 T3e = T1p + T1q;
Chris@42 592 }
Chris@42 593 {
Chris@42 594 E T4, T5, T1s, T1t;
Chris@42 595 T4 = Rp[WS(rs, 5)];
Chris@42 596 T5 = Rm[WS(rs, 4)];
Chris@42 597 T6 = T4 + T5;
Chris@42 598 T3f = T4 - T5;
Chris@42 599 T1s = Ip[WS(rs, 5)];
Chris@42 600 T1t = Im[WS(rs, 4)];
Chris@42 601 T1u = T1s - T1t;
Chris@42 602 T2S = T1s + T1t;
Chris@42 603 }
Chris@42 604 T7 = T3 + T6;
Chris@42 605 T3T = T2R - T2S;
Chris@42 606 T49 = T3f + T3e;
Chris@42 607 TE = T3 - T6;
Chris@42 608 T1v = T1r - T1u;
Chris@42 609 T2T = T2R + T2S;
Chris@42 610 T3g = T3e - T3f;
Chris@42 611 T2d = T1r + T1u;
Chris@42 612 }
Chris@42 613 {
Chris@42 614 E Te, T3M, T3X, TF, TV, T2E, T2W, T21, TA, T3Q, T41, TJ, T1h, T2O, T36;
Chris@42 615 E T25, Tl, T3N, T3Y, TG, T12, T2H, T2Z, T22, Tt, T3P, T40, TI, T1a, T2L;
Chris@42 616 E T33, T24;
Chris@42 617 {
Chris@42 618 E Ta, T2U, TR, T2C, Td, T2D, TU, T2V;
Chris@42 619 {
Chris@42 620 E T8, T9, TP, TQ;
Chris@42 621 T8 = Rp[WS(rs, 4)];
Chris@42 622 T9 = Rm[WS(rs, 5)];
Chris@42 623 Ta = T8 + T9;
Chris@42 624 T2U = T8 - T9;
Chris@42 625 TP = Ip[WS(rs, 4)];
Chris@42 626 TQ = Im[WS(rs, 5)];
Chris@42 627 TR = TP - TQ;
Chris@42 628 T2C = TP + TQ;
Chris@42 629 }
Chris@42 630 {
Chris@42 631 E Tb, Tc, TS, TT;
Chris@42 632 Tb = Rp[WS(rs, 9)];
Chris@42 633 Tc = Rm[0];
Chris@42 634 Td = Tb + Tc;
Chris@42 635 T2D = Tb - Tc;
Chris@42 636 TS = Ip[WS(rs, 9)];
Chris@42 637 TT = Im[0];
Chris@42 638 TU = TS - TT;
Chris@42 639 T2V = TS + TT;
Chris@42 640 }
Chris@42 641 Te = Ta + Td;
Chris@42 642 T3M = T2U - T2V;
Chris@42 643 T3X = T2D + T2C;
Chris@42 644 TF = Ta - Td;
Chris@42 645 TV = TR - TU;
Chris@42 646 T2E = T2C - T2D;
Chris@42 647 T2W = T2U + T2V;
Chris@42 648 T21 = TR + TU;
Chris@42 649 }
Chris@42 650 {
Chris@42 651 E Tw, T34, T1d, T2N, Tz, T2M, T1g, T35;
Chris@42 652 {
Chris@42 653 E Tu, Tv, T1b, T1c;
Chris@42 654 Tu = Rm[WS(rs, 7)];
Chris@42 655 Tv = Rp[WS(rs, 2)];
Chris@42 656 Tw = Tu + Tv;
Chris@42 657 T34 = Tu - Tv;
Chris@42 658 T1b = Ip[WS(rs, 2)];
Chris@42 659 T1c = Im[WS(rs, 7)];
Chris@42 660 T1d = T1b - T1c;
Chris@42 661 T2N = T1b + T1c;
Chris@42 662 }
Chris@42 663 {
Chris@42 664 E Tx, Ty, T1e, T1f;
Chris@42 665 Tx = Rm[WS(rs, 2)];
Chris@42 666 Ty = Rp[WS(rs, 7)];
Chris@42 667 Tz = Tx + Ty;
Chris@42 668 T2M = Tx - Ty;
Chris@42 669 T1e = Ip[WS(rs, 7)];
Chris@42 670 T1f = Im[WS(rs, 2)];
Chris@42 671 T1g = T1e - T1f;
Chris@42 672 T35 = T1e + T1f;
Chris@42 673 }
Chris@42 674 TA = Tw + Tz;
Chris@42 675 T3Q = T34 + T35;
Chris@42 676 T41 = T2M - T2N;
Chris@42 677 TJ = Tw - Tz;
Chris@42 678 T1h = T1d - T1g;
Chris@42 679 T2O = T2M + T2N;
Chris@42 680 T36 = T34 - T35;
Chris@42 681 T25 = T1d + T1g;
Chris@42 682 }
Chris@42 683 {
Chris@42 684 E Th, T2X, TY, T2G, Tk, T2F, T11, T2Y;
Chris@42 685 {
Chris@42 686 E Tf, Tg, TW, TX;
Chris@42 687 Tf = Rm[WS(rs, 3)];
Chris@42 688 Tg = Rp[WS(rs, 6)];
Chris@42 689 Th = Tf + Tg;
Chris@42 690 T2X = Tf - Tg;
Chris@42 691 TW = Ip[WS(rs, 6)];
Chris@42 692 TX = Im[WS(rs, 3)];
Chris@42 693 TY = TW - TX;
Chris@42 694 T2G = TW + TX;
Chris@42 695 }
Chris@42 696 {
Chris@42 697 E Ti, Tj, TZ, T10;
Chris@42 698 Ti = Rp[WS(rs, 1)];
Chris@42 699 Tj = Rm[WS(rs, 8)];
Chris@42 700 Tk = Ti + Tj;
Chris@42 701 T2F = Ti - Tj;
Chris@42 702 TZ = Ip[WS(rs, 1)];
Chris@42 703 T10 = Im[WS(rs, 8)];
Chris@42 704 T11 = TZ - T10;
Chris@42 705 T2Y = TZ + T10;
Chris@42 706 }
Chris@42 707 Tl = Th + Tk;
Chris@42 708 T3N = T2X - T2Y;
Chris@42 709 T3Y = T2F - T2G;
Chris@42 710 TG = Th - Tk;
Chris@42 711 T12 = TY - T11;
Chris@42 712 T2H = T2F + T2G;
Chris@42 713 T2Z = T2X + T2Y;
Chris@42 714 T22 = TY + T11;
Chris@42 715 }
Chris@42 716 {
Chris@42 717 E Tp, T31, T16, T2J, Ts, T2K, T19, T32;
Chris@42 718 {
Chris@42 719 E Tn, To, T14, T15;
Chris@42 720 Tn = Rp[WS(rs, 8)];
Chris@42 721 To = Rm[WS(rs, 1)];
Chris@42 722 Tp = Tn + To;
Chris@42 723 T31 = Tn - To;
Chris@42 724 T14 = Ip[WS(rs, 8)];
Chris@42 725 T15 = Im[WS(rs, 1)];
Chris@42 726 T16 = T14 - T15;
Chris@42 727 T2J = T14 + T15;
Chris@42 728 }
Chris@42 729 {
Chris@42 730 E Tq, Tr, T17, T18;
Chris@42 731 Tq = Rm[WS(rs, 6)];
Chris@42 732 Tr = Rp[WS(rs, 3)];
Chris@42 733 Ts = Tq + Tr;
Chris@42 734 T2K = Tq - Tr;
Chris@42 735 T17 = Ip[WS(rs, 3)];
Chris@42 736 T18 = Im[WS(rs, 6)];
Chris@42 737 T19 = T17 - T18;
Chris@42 738 T32 = T17 + T18;
Chris@42 739 }
Chris@42 740 Tt = Tp + Ts;
Chris@42 741 T3P = T31 + T32;
Chris@42 742 T40 = T2K + T2J;
Chris@42 743 TI = Tp - Ts;
Chris@42 744 T1a = T16 - T19;
Chris@42 745 T2L = T2J - T2K;
Chris@42 746 T33 = T31 - T32;
Chris@42 747 T24 = T16 + T19;
Chris@42 748 }
Chris@42 749 T13 = TV - T12;
Chris@42 750 T3n = T2W - T2Z;
Chris@42 751 T3o = T33 - T36;
Chris@42 752 T1i = T1a - T1h;
Chris@42 753 T26 = T24 - T25;
Chris@42 754 T4e = T3P - T3Q;
Chris@42 755 T4d = T3M - T3N;
Chris@42 756 T23 = T21 - T22;
Chris@42 757 T1n = TI - TJ;
Chris@42 758 T42 = T40 - T41;
Chris@42 759 T3Z = T3X - T3Y;
Chris@42 760 T1m = TF - TG;
Chris@42 761 T2h = Te - Tl;
Chris@42 762 T2I = T2E + T2H;
Chris@42 763 T2i = Tt - TA;
Chris@42 764 T2P = T2L + T2O;
Chris@42 765 T30 = T2W + T2Z;
Chris@42 766 T37 = T33 + T36;
Chris@42 767 T38 = T30 + T37;
Chris@42 768 Tm = Te + Tl;
Chris@42 769 TB = Tt + TA;
Chris@42 770 TC = Tm + TB;
Chris@42 771 T46 = T3X + T3Y;
Chris@42 772 T47 = T40 + T41;
Chris@42 773 T4a = T46 + T47;
Chris@42 774 T2a = T21 + T22;
Chris@42 775 T2b = T24 + T25;
Chris@42 776 T2e = T2a + T2b;
Chris@42 777 T1w = TV + T12;
Chris@42 778 T1x = T1a + T1h;
Chris@42 779 T1y = T1w + T1x;
Chris@42 780 T3O = T3M + T3N;
Chris@42 781 T3R = T3P + T3Q;
Chris@42 782 T3U = T3O + T3R;
Chris@42 783 T3h = T2E - T2H;
Chris@42 784 T3i = T2L - T2O;
Chris@42 785 T3j = T3h + T3i;
Chris@42 786 TH = TF + TG;
Chris@42 787 TK = TI + TJ;
Chris@42 788 TL = TH + TK;
Chris@42 789 }
/* First stores: Rp[0] and Rm[0] are plain sums, with no W factor applied. */
Chris@42 790 Rp[0] = T7 + TC;
Chris@42 791 Rm[0] = T2d + T2e;
/* Each scope below forms one or more output pairs from two intermediates and a W[2j], W[2j+1] pair. */
Chris@42 792 {
Chris@42 793 E T1U, T1W, T1T, T1V;
Chris@42 794 T1U = TE + TL;
Chris@42 795 T1W = T1v + T1y;
Chris@42 796 T1T = W[18];
Chris@42 797 T1V = W[19];
Chris@42 798 Rp[WS(rs, 5)] = FNMS(T1V, T1W, T1T * T1U);
Chris@42 799 Rm[WS(rs, 5)] = FMA(T1V, T1U, T1T * T1W);
Chris@42 800 }
Chris@42 801 {
Chris@42 802 E T4y, T4A, T4x, T4z;
Chris@42 803 T4y = T3T + T3U;
Chris@42 804 T4A = T49 + T4a;
Chris@42 805 T4x = W[8];
Chris@42 806 T4z = W[9];
Chris@42 807 Ip[WS(rs, 2)] = FNMS(T4z, T4A, T4x * T4y);
Chris@42 808 Im[WS(rs, 2)] = FMA(T4x, T4A, T4z * T4y);
Chris@42 809 }
Chris@42 810 {
Chris@42 811 E T3I, T3K, T3H, T3J;
Chris@42 812 T3I = T2T + T38;
Chris@42 813 T3K = T3g + T3j;
Chris@42 814 T3H = W[28];
Chris@42 815 T3J = W[29];
Chris@42 816 Ip[WS(rs, 7)] = FNMS(T3J, T3K, T3H * T3I);
Chris@42 817 Im[WS(rs, 7)] = FMA(T3H, T3K, T3J * T3I);
Chris@42 818 }
Chris@42 819 {
Chris@42 820 E T27, T2j, T2v, T2r, T2g, T2u, T20, T2q;
Chris@42 821 T27 = FMA(KP951056516, T23, KP587785252 * T26);
Chris@42 822 T2j = FMA(KP951056516, T2h, KP587785252 * T2i);
Chris@42 823 T2v = FNMS(KP951056516, T2i, KP587785252 * T2h);
Chris@42 824 T2r = FNMS(KP951056516, T26, KP587785252 * T23);
Chris@42 825 {
Chris@42 826 E T2c, T2f, T1Y, T1Z;
Chris@42 827 T2c = KP559016994 * (T2a - T2b);
Chris@42 828 T2f = FNMS(KP250000000, T2e, T2d);
Chris@42 829 T2g = T2c + T2f;
Chris@42 830 T2u = T2f - T2c;
Chris@42 831 T1Y = KP559016994 * (Tm - TB);
Chris@42 832 T1Z = FNMS(KP250000000, TC, T7);
Chris@42 833 T20 = T1Y + T1Z;
Chris@42 834 T2q = T1Z - T1Y;
Chris@42 835 }
Chris@42 836 {
Chris@42 837 E T28, T2k, T1X, T29;
Chris@42 838 T28 = T20 + T27;
Chris@42 839 T2k = T2g - T2j;
Chris@42 840 T1X = W[6];
Chris@42 841 T29 = W[7];
Chris@42 842 Rp[WS(rs, 2)] = FNMS(T29, T2k, T1X * T28);
Chris@42 843 Rm[WS(rs, 2)] = FMA(T29, T28, T1X * T2k);
Chris@42 844 }
Chris@42 845 {
Chris@42 846 E T2y, T2A, T2x, T2z;
Chris@42 847 T2y = T2q - T2r;
Chris@42 848 T2A = T2v + T2u;
Chris@42 849 T2x = W[22];
Chris@42 850 T2z = W[23];
Chris@42 851 Rp[WS(rs, 6)] = FNMS(T2z, T2A, T2x * T2y);
Chris@42 852 Rm[WS(rs, 6)] = FMA(T2z, T2y, T2x * T2A);
Chris@42 853 }
Chris@42 854 {
Chris@42 855 E T2m, T2o, T2l, T2n;
Chris@42 856 T2m = T20 - T27;
Chris@42 857 T2o = T2j + T2g;
Chris@42 858 T2l = W[30];
Chris@42 859 T2n = W[31];
Chris@42 860 Rp[WS(rs, 8)] = FNMS(T2n, T2o, T2l * T2m);
Chris@42 861 Rm[WS(rs, 8)] = FMA(T2n, T2m, T2l * T2o);
Chris@42 862 }
Chris@42 863 {
Chris@42 864 E T2s, T2w, T2p, T2t;
Chris@42 865 T2s = T2q + T2r;
Chris@42 866 T2w = T2u - T2v;
Chris@42 867 T2p = W[14];
Chris@42 868 T2t = W[15];
Chris@42 869 Rp[WS(rs, 4)] = FNMS(T2t, T2w, T2p * T2s);
Chris@42 870 Rm[WS(rs, 4)] = FMA(T2t, T2s, T2p * T2w);
Chris@42 871 }
Chris@42 872 }
Chris@42 873 {
Chris@42 874 E T43, T4f, T4r, T4m, T4c, T4q, T3W, T4n;
Chris@42 875 T43 = FMA(KP951056516, T3Z, KP587785252 * T42);
Chris@42 876 T4f = FMA(KP951056516, T4d, KP587785252 * T4e);
Chris@42 877 T4r = FNMS(KP951056516, T4e, KP587785252 * T4d);
Chris@42 878 T4m = FNMS(KP951056516, T42, KP587785252 * T3Z);
Chris@42 879 {
Chris@42 880 E T48, T4b, T3S, T3V;
Chris@42 881 T48 = KP559016994 * (T46 - T47);
Chris@42 882 T4b = FNMS(KP250000000, T4a, T49);
Chris@42 883 T4c = T48 + T4b;
Chris@42 884 T4q = T4b - T48;
Chris@42 885 T3S = KP559016994 * (T3O - T3R);
Chris@42 886 T3V = FNMS(KP250000000, T3U, T3T);
Chris@42 887 T3W = T3S + T3V;
Chris@42 888 T4n = T3V - T3S;
Chris@42 889 }
Chris@42 890 {
Chris@42 891 E T44, T4g, T3L, T45;
Chris@42 892 T44 = T3W - T43;
Chris@42 893 T4g = T4c + T4f;
Chris@42 894 T3L = W[0];
Chris@42 895 T45 = W[1];
Chris@42 896 Ip[0] = FNMS(T45, T4g, T3L * T44);
Chris@42 897 Im[0] = FMA(T3L, T4g, T45 * T44);
Chris@42 898 }
Chris@42 899 {
Chris@42 900 E T4u, T4w, T4t, T4v;
Chris@42 901 T4u = T4n - T4m;
Chris@42 902 T4w = T4q + T4r;
Chris@42 903 T4t = W[32];
Chris@42 904 T4v = W[33];
Chris@42 905 Ip[WS(rs, 8)] = FNMS(T4v, T4w, T4t * T4u);
Chris@42 906 Im[WS(rs, 8)] = FMA(T4t, T4w, T4v * T4u);
Chris@42 907 }
Chris@42 908 {
Chris@42 909 E T4i, T4k, T4h, T4j;
Chris@42 910 T4i = T43 + T3W;
Chris@42 911 T4k = T4c - T4f;
Chris@42 912 T4h = W[16];
Chris@42 913 T4j = W[17];
Chris@42 914 Ip[WS(rs, 4)] = FNMS(T4j, T4k, T4h * T4i);
Chris@42 915 Im[WS(rs, 4)] = FMA(T4h, T4k, T4j * T4i);
Chris@42 916 }
Chris@42 917 {
Chris@42 918 E T4o, T4s, T4l, T4p;
Chris@42 919 T4o = T4m + T4n;
Chris@42 920 T4s = T4q - T4r;
Chris@42 921 T4l = W[24];
Chris@42 922 T4p = W[25];
Chris@42 923 Ip[WS(rs, 6)] = FNMS(T4p, T4s, T4l * T4o);
Chris@42 924 Im[WS(rs, 6)] = FMA(T4l, T4s, T4p * T4o);
Chris@42 925 }
Chris@42 926 }
Chris@42 927 {
Chris@42 928 E T1j, T1o, T1M, T1J, T1B, T1N, TO, T1I;
Chris@42 929 T1j = FNMS(KP951056516, T1i, KP587785252 * T13);
Chris@42 930 T1o = FNMS(KP951056516, T1n, KP587785252 * T1m);
Chris@42 931 T1M = FMA(KP951056516, T1m, KP587785252 * T1n);
Chris@42 932 T1J = FMA(KP951056516, T13, KP587785252 * T1i);
Chris@42 933 {
Chris@42 934 E T1z, T1A, TM, TN;
Chris@42 935 T1z = FNMS(KP250000000, T1y, T1v);
Chris@42 936 T1A = KP559016994 * (T1w - T1x);
Chris@42 937 T1B = T1z - T1A;
Chris@42 938 T1N = T1A + T1z;
Chris@42 939 TM = FNMS(KP250000000, TL, TE);
Chris@42 940 TN = KP559016994 * (TH - TK);
Chris@42 941 TO = TM - TN;
Chris@42 942 T1I = TN + TM;
Chris@42 943 }
Chris@42 944 {
Chris@42 945 E T1k, T1C, TD, T1l;
Chris@42 946 T1k = TO - T1j;
Chris@42 947 T1C = T1o + T1B;
Chris@42 948 TD = W[2];
Chris@42 949 T1l = W[3];
Chris@42 950 Rp[WS(rs, 1)] = FNMS(T1l, T1C, TD * T1k);
Chris@42 951 Rm[WS(rs, 1)] = FMA(T1l, T1k, TD * T1C);
Chris@42 952 }
Chris@42 953 {
Chris@42 954 E T1Q, T1S, T1P, T1R;
Chris@42 955 T1Q = T1I + T1J;
Chris@42 956 T1S = T1N - T1M;
Chris@42 957 T1P = W[26];
Chris@42 958 T1R = W[27];
Chris@42 959 Rp[WS(rs, 7)] = FNMS(T1R, T1S, T1P * T1Q);
Chris@42 960 Rm[WS(rs, 7)] = FMA(T1R, T1Q, T1P * T1S);
Chris@42 961 }
Chris@42 962 {
Chris@42 963 E T1E, T1G, T1D, T1F;
Chris@42 964 T1E = TO + T1j;
Chris@42 965 T1G = T1B - T1o;
Chris@42 966 T1D = W[34];
Chris@42 967 T1F = W[35];
Chris@42 968 Rp[WS(rs, 9)] = FNMS(T1F, T1G, T1D * T1E);
Chris@42 969 Rm[WS(rs, 9)] = FMA(T1F, T1E, T1D * T1G);
Chris@42 970 }
Chris@42 971 {
Chris@42 972 E T1K, T1O, T1H, T1L;
Chris@42 973 T1K = T1I - T1J;
Chris@42 974 T1O = T1M + T1N;
Chris@42 975 T1H = W[10];
Chris@42 976 T1L = W[11];
Chris@42 977 Rp[WS(rs, 3)] = FNMS(T1L, T1O, T1H * T1K);
Chris@42 978 Rm[WS(rs, 3)] = FMA(T1L, T1K, T1H * T1O);
Chris@42 979 }
Chris@42 980 }
Chris@42 981 {
Chris@42 982 E T2Q, T3p, T3B, T3x, T3m, T3A, T3b, T3w;
Chris@42 983 T2Q = FNMS(KP951056516, T2P, KP587785252 * T2I);
Chris@42 984 T3p = FNMS(KP951056516, T3o, KP587785252 * T3n);
Chris@42 985 T3B = FMA(KP951056516, T3n, KP587785252 * T3o);
Chris@42 986 T3x = FMA(KP951056516, T2I, KP587785252 * T2P);
Chris@42 987 {
Chris@42 988 E T3k, T3l, T39, T3a;
Chris@42 989 T3k = FNMS(KP250000000, T3j, T3g);
Chris@42 990 T3l = KP559016994 * (T3h - T3i);
Chris@42 991 T3m = T3k - T3l;
Chris@42 992 T3A = T3l + T3k;
Chris@42 993 T39 = FNMS(KP250000000, T38, T2T);
Chris@42 994 T3a = KP559016994 * (T30 - T37);
Chris@42 995 T3b = T39 - T3a;
Chris@42 996 T3w = T3a + T39;
Chris@42 997 }
Chris@42 998 {
Chris@42 999 E T3c, T3q, T2B, T3d;
Chris@42 1000 T3c = T2Q + T3b;
Chris@42 1001 T3q = T3m - T3p;
Chris@42 1002 T2B = W[4];
Chris@42 1003 T3d = W[5];
Chris@42 1004 Ip[WS(rs, 1)] = FNMS(T3d, T3q, T2B * T3c);
Chris@42 1005 Im[WS(rs, 1)] = FMA(T2B, T3q, T3d * T3c);
Chris@42 1006 }
Chris@42 1007 {
Chris@42 1008 E T3E, T3G, T3D, T3F;
Chris@42 1009 T3E = T3x + T3w;
Chris@42 1010 T3G = T3A - T3B;
Chris@42 1011 T3D = W[36];
Chris@42 1012 T3F = W[37];
Chris@42 1013 Ip[WS(rs, 9)] = FNMS(T3F, T3G, T3D * T3E);
Chris@42 1014 Im[WS(rs, 9)] = FMA(T3D, T3G, T3F * T3E);
Chris@42 1015 }
Chris@42 1016 {
Chris@42 1017 E T3s, T3u, T3r, T3t;
Chris@42 1018 T3s = T3b - T2Q;
Chris@42 1019 T3u = T3m + T3p;
Chris@42 1020 T3r = W[12];
Chris@42 1021 T3t = W[13];
Chris@42 1022 Ip[WS(rs, 3)] = FNMS(T3t, T3u, T3r * T3s);
Chris@42 1023 Im[WS(rs, 3)] = FMA(T3r, T3u, T3t * T3s);
Chris@42 1024 }
Chris@42 1025 {
Chris@42 1026 E T3y, T3C, T3v, T3z;
Chris@42 1027 T3y = T3w - T3x;
Chris@42 1028 T3C = T3A + T3B;
Chris@42 1029 T3v = W[20];
Chris@42 1030 T3z = W[21];
Chris@42 1031 Ip[WS(rs, 5)] = FNMS(T3z, T3C, T3v * T3y);
Chris@42 1032 Im[WS(rs, 5)] = FMA(T3v, T3C, T3z * T3y);
Chris@42 1033 }
Chris@42 1034 }
Chris@42 1035 }
Chris@42 1036 }
Chris@42 1037 }
Chris@42 1038
/*
 * Twiddle instruction table referenced by desc below.
 * NOTE(review): presumably requests the full twiddle set for a size-20
 * transform (the codelet consumes 38 W entries per iteration) -- confirm
 * against the tw_instr / TW_FULL definitions in the FFTW headers.
 */
Chris@42 1039 static const tw_instr twinstr[] = {
Chris@42 1040 {TW_FULL, 1, 20},
Chris@42 1041 {TW_NEXT, 1, 0}
Chris@42 1042 };
Chris@42 1043
/*
 * Descriptor handed to the planner for this codelet: size 20, its name,
 * the twiddle table and GENUS. The trailing {184, 62, 62, 0} matches the
 * add / multiply / fused multiply-add counts quoted in the generated
 * comment for the non-FMA variant of hc2cb_20.
 */
Chris@42 1044 static const hc2c_desc desc = { 20, "hc2cb_20", twinstr, &GENUS, {184, 62, 62, 0} };
Chris@42 1045
/*
 * Registration entry point: passes the non-FMA variant of hc2cb_20 and its
 * descriptor to X(khc2c_register) on planner p, with the HC2C_VIA_RDFT flag.
 */
Chris@42 1046 void X(codelet_hc2cb_20) (planner *p) {
Chris@42 1047 X(khc2c_register) (p, hc2cb_20, &desc, HC2C_VIA_RDFT);
Chris@42 1048 }
Chris@42 1049 #endif /* HAVE_FMA */