annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cb2_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:51:50 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 20 -dif -name hc2cb2_20 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 276 FP additions, 198 FP multiplications,
Chris@42 32 * (or, 136 additions, 58 multiplications, 140 fused multiply/add),
Chris@42 33 * 160 stack variables, 4 constants, and 80 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cb2_20 -- FMA variant (this definition is compiled when HAVE_FMA is set).
 *
 * Machine-generated by genfft; the "Generated by" comment above records the
 * options used (-n 20, -dif, -sign 1, -twiddle-log3, -precompute-twiddles,
 * -reorder-insns, -schedule-for-pipeline).  The interleaving of loads and
 * arithmetic below is the generator's schedule, so the statement order should
 * not be rearranged by hand.
 *
 * Parameters:
 *   Rp, Ip, Rm, Im  Four arrays that are read and then overwritten in place.
 *                   Each is accessed at index 0 and at WS(rs, 1)..WS(rs, 9).
 *                   Rp and Ip advance by ms after every iteration, while Rm
 *                   and Im step backwards by ms.
 *   W               Read-only table; eight consecutive values W[0]..W[7] are
 *                   consumed per iteration.  It is offset by (mb - 1) * 8
 *                   before the first iteration and advanced by 8 afterwards.
 *   rs              Stride handed to WS() to form the element offsets.
 *   mb, me          Half-open iteration range: the loop runs for m = mb while
 *                   m < me.
 *   ms              Per-iteration pointer step for Rp/Ip/Rm/Im.
 *
 * Returns nothing; the results are the values stored back into the arrays.
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE come
 * from the included project headers and are not visible here.  Presumably
 * FMA(a, b, c) is a*b + c and FNMS(a, b, c) is c - a*b -- confirm there.
 */
Chris@42 37 static void hc2cb2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Named constants.  Numerically: KP951056516 ~ sin(2*pi/5), */
/* KP559016994 ~ sqrt(5)/4, KP618033988 ~ (sqrt(5)-1)/2, KP250000000 = 1/4. */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 42 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 43 {
Chris@42 44 INT m;
/* One pass of the loop body per m in [mb, me); all pointer updates live in */
/* the for-increment expression. */
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(80, rs)) {
/* These temporaries are declared at loop-body scope because they are set */
/* inside the nested blocks and consumed by the final stores near the end. */
Chris@42 46 E T1S, T1O, T1s, TI, T24, T1Y, T2g, T2k, TS, TR, T1I, T26, T1o, T20, T1F;
Chris@42 47 E T25, TT, T1Z;
Chris@42 48 {
Chris@42 49 E TD, TH, TE, T1L, T1N, T1X, TG, T1V, T2Y, T2b, T29, T2s, T36, T3e, T31;
Chris@42 50 E T2o, T3b, T5b, T2c, T2U, T4y, T4u, T2f, T5g, T47, T5p, T4b, T5l;
Chris@42 51 {
Chris@42 52 E T1r, TF, T2T, T1M, T1R, T2X, T2r, T4x;
/* Twiddle setup: the eight stored values W[0..7] are read into */
/* TD, TG, TE, TH, T1V, T1X, T1L, T1N; every other multiplier used for the */
/* outputs (T29, T2b, TI, T1s, T2c, T2f, ...) is derived from them here. */
Chris@42 53 TD = W[0];
Chris@42 54 TH = W[3];
Chris@42 55 TE = W[2];
Chris@42 56 T1L = W[6];
Chris@42 57 T1N = W[7];
Chris@42 58 T1r = TD * TH;
Chris@42 59 TF = TD * TE;
Chris@42 60 T2T = TE * T1L;
Chris@42 61 T1M = TD * T1L;
Chris@42 62 T1R = TD * T1N;
Chris@42 63 T2X = TE * T1N;
Chris@42 64 T1X = W[5];
Chris@42 65 TG = W[1];
Chris@42 66 T1V = W[4];
Chris@42 67 T2Y = FNMS(TH, T1L, T2X);
Chris@42 68 T2r = TD * T1X;
Chris@42 69 {
Chris@42 70 E T23, T2n, T1W, T2a;
Chris@42 71 T23 = TE * T1X;
Chris@42 72 T1S = FNMS(TG, T1L, T1R);
Chris@42 73 T1O = FMA(TG, T1N, T1M);
Chris@42 74 T2b = FMA(TG, TE, T1r);
Chris@42 75 T1s = FNMS(TG, TE, T1r);
Chris@42 76 T29 = FNMS(TG, TH, TF);
Chris@42 77 TI = FMA(TG, TH, TF);
Chris@42 78 T2n = TD * T1V;
Chris@42 79 T1W = TE * T1V;
Chris@42 80 T2s = FMA(TG, T1V, T2r);
Chris@42 81 T36 = FNMS(TG, T1V, T2r);
Chris@42 82 T3e = FMA(TH, T1V, T23);
Chris@42 83 T24 = FNMS(TH, T1V, T23);
Chris@42 84 T2a = T29 * T1V;
Chris@42 85 T31 = FMA(TG, T1X, T2n);
Chris@42 86 T2o = FNMS(TG, T1X, T2n);
Chris@42 87 T3b = FNMS(TH, T1X, T1W);
Chris@42 88 T1Y = FMA(TH, T1X, T1W);
Chris@42 89 T5b = FNMS(T2b, T1X, T2a);
Chris@42 90 T2c = FMA(T2b, T1X, T2a);
Chris@42 91 T2U = FMA(TH, T1N, T2T);
Chris@42 92 }
Chris@42 93 T4x = T29 * T1N;
Chris@42 94 {
Chris@42 95 E T4t, T2d, T2j, T2e;
Chris@42 96 T4t = T29 * T1L;
Chris@42 97 T2e = T29 * T1X;
Chris@42 98 T4y = FNMS(T2b, T1L, T4x);
Chris@42 99 T4u = FMA(T2b, T1N, T4t);
Chris@42 100 T2f = FNMS(T2b, T1V, T2e);
Chris@42 101 T5g = FMA(T2b, T1V, T2e);
Chris@42 102 T2d = T2c * T1L;
Chris@42 103 T2j = T2c * T1N;
Chris@42 104 T47 = TI * T1V;
Chris@42 105 T2g = FMA(T2f, T1N, T2d);
Chris@42 106 T2k = FNMS(T2f, T1L, T2j);
Chris@42 107 T5p = TI * T1N;
Chris@42 108 T4b = TI * T1X;
Chris@42 109 T5l = TI * T1L;
Chris@42 110 }
Chris@42 111 }
Chris@42 112 {
Chris@42 113 E T4f, T48, T4c, T4k, T5m, T5q, T3V, T4V, TJ, T7, T3j, T4B, T2H, T1z, T3q;
Chris@42 114 E T43, T1n, T52, T42, T3x, T53, T2D, T18, T2A, T1H, T4R, T4X, T4W, T4O, T1G;
Chris@42 115 E T2O, T3I, T2P, T3P, T2K, T2M, T1C, T1E, TC, T2w, T40, T3Y, T4K, T4I, TQ;
Chris@42 116 {
Chris@42 117 E T3h, T3, T1w, T3T, T1v, T3U, T6, T1x;
Chris@42 118 {
Chris@42 119 E T1t, T1u, T1, T2, T4, T5;
/* Input phase: from here until the first store, all 40 input values */
/* (10 each from Rp, Rm, Ip, Im) are loaded and combined into sums and */
/* differences.  The last few twiddle products are interleaved with the */
/* first loads. */
Chris@42 120 T1 = Rp[0];
Chris@42 121 T2 = Rm[WS(rs, 9)];
Chris@42 122 T1t = Ip[0];
Chris@42 123 T4f = FNMS(T1s, T1X, T47);
Chris@42 124 T48 = FMA(T1s, T1X, T47);
Chris@42 125 T4c = FNMS(T1s, T1V, T4b);
Chris@42 126 T4k = FMA(T1s, T1V, T4b);
Chris@42 127 T5m = FMA(T1s, T1N, T5l);
Chris@42 128 T5q = FNMS(T1s, T1L, T5p);
Chris@42 129 T3h = T1 - T2;
Chris@42 130 T3 = T1 + T2;
Chris@42 131 T1u = Im[WS(rs, 9)];
Chris@42 132 T4 = Rp[WS(rs, 5)];
Chris@42 133 T5 = Rm[WS(rs, 4)];
Chris@42 134 T1w = Ip[WS(rs, 5)];
Chris@42 135 T3T = T1t + T1u;
Chris@42 136 T1v = T1t - T1u;
Chris@42 137 T3U = T4 - T5;
Chris@42 138 T6 = T4 + T5;
Chris@42 139 T1x = Im[WS(rs, 4)];
Chris@42 140 }
Chris@42 141 {
Chris@42 142 E T3L, T4M, TK, Te, T3m, T4C, T2y, T1f, T3H, T4Q, TO, TA, T3w, T4G, T2C;
Chris@42 143 E T17, T3O, T4N, TL, Tl, T3p, T4D, T2z, T1m, T3r, Tp, TX, T3C, TW, T3D;
Chris@42 144 E Ts, TY;
Chris@42 145 {
Chris@42 146 E T3u, Tw, T14, T3G, T13, T3F, Tz, T15;
Chris@42 147 {
Chris@42 148 E T3k, Ta, T1c, T3J, T1b, T3K, Td, T1d;
Chris@42 149 {
Chris@42 150 E T19, T1a, Tb, Tc;
Chris@42 151 {
Chris@42 152 E T8, T3i, T1y, T9;
Chris@42 153 T8 = Rp[WS(rs, 4)];
Chris@42 154 T3V = T3T - T3U;
Chris@42 155 T4V = T3U + T3T;
Chris@42 156 TJ = T3 - T6;
Chris@42 157 T7 = T3 + T6;
Chris@42 158 T3i = T1w + T1x;
Chris@42 159 T1y = T1w - T1x;
Chris@42 160 T9 = Rm[WS(rs, 5)];
Chris@42 161 T19 = Ip[WS(rs, 4)];
Chris@42 162 T3j = T3h + T3i;
Chris@42 163 T4B = T3h - T3i;
Chris@42 164 T2H = T1v + T1y;
Chris@42 165 T1z = T1v - T1y;
Chris@42 166 T3k = T8 - T9;
Chris@42 167 Ta = T8 + T9;
Chris@42 168 T1a = Im[WS(rs, 5)];
Chris@42 169 }
Chris@42 170 Tb = Rp[WS(rs, 9)];
Chris@42 171 Tc = Rm[0];
Chris@42 172 T1c = Ip[WS(rs, 9)];
Chris@42 173 T3J = T19 + T1a;
Chris@42 174 T1b = T19 - T1a;
Chris@42 175 T3K = Tb - Tc;
Chris@42 176 Td = Tb + Tc;
Chris@42 177 T1d = Im[0];
Chris@42 178 }
Chris@42 179 {
Chris@42 180 E T11, T12, Tx, Ty;
Chris@42 181 {
Chris@42 182 E Tu, T3l, T1e, Tv;
Chris@42 183 Tu = Rm[WS(rs, 7)];
Chris@42 184 T3L = T3J - T3K;
Chris@42 185 T4M = T3K + T3J;
Chris@42 186 TK = Ta - Td;
Chris@42 187 Te = Ta + Td;
Chris@42 188 T3l = T1c + T1d;
Chris@42 189 T1e = T1c - T1d;
Chris@42 190 Tv = Rp[WS(rs, 2)];
Chris@42 191 T11 = Ip[WS(rs, 2)];
Chris@42 192 T3m = T3k + T3l;
Chris@42 193 T4C = T3k - T3l;
Chris@42 194 T2y = T1b + T1e;
Chris@42 195 T1f = T1b - T1e;
Chris@42 196 T3u = Tu - Tv;
Chris@42 197 Tw = Tu + Tv;
Chris@42 198 T12 = Im[WS(rs, 7)];
Chris@42 199 }
Chris@42 200 Tx = Rm[WS(rs, 2)];
Chris@42 201 Ty = Rp[WS(rs, 7)];
Chris@42 202 T14 = Ip[WS(rs, 7)];
Chris@42 203 T3G = T11 + T12;
Chris@42 204 T13 = T11 - T12;
Chris@42 205 T3F = Tx - Ty;
Chris@42 206 Tz = Tx + Ty;
Chris@42 207 T15 = Im[WS(rs, 2)];
Chris@42 208 }
Chris@42 209 }
Chris@42 210 {
Chris@42 211 E T3n, Th, T1j, T3N, T1i, T3M, Tk, T1k;
Chris@42 212 {
Chris@42 213 E T1g, T1h, Ti, Tj;
Chris@42 214 {
Chris@42 215 E Tf, T3v, T16, Tg;
Chris@42 216 Tf = Rm[WS(rs, 3)];
Chris@42 217 T3H = T3F + T3G;
Chris@42 218 T4Q = T3F - T3G;
Chris@42 219 TO = Tw - Tz;
Chris@42 220 TA = Tw + Tz;
Chris@42 221 T3v = T14 + T15;
Chris@42 222 T16 = T14 - T15;
Chris@42 223 Tg = Rp[WS(rs, 6)];
Chris@42 224 T1g = Ip[WS(rs, 6)];
Chris@42 225 T3w = T3u - T3v;
Chris@42 226 T4G = T3u + T3v;
Chris@42 227 T2C = T13 + T16;
Chris@42 228 T17 = T13 - T16;
Chris@42 229 T3n = Tf - Tg;
Chris@42 230 Th = Tf + Tg;
Chris@42 231 T1h = Im[WS(rs, 3)];
Chris@42 232 }
Chris@42 233 Ti = Rp[WS(rs, 1)];
Chris@42 234 Tj = Rm[WS(rs, 8)];
Chris@42 235 T1j = Ip[WS(rs, 1)];
Chris@42 236 T3N = T1g + T1h;
Chris@42 237 T1i = T1g - T1h;
Chris@42 238 T3M = Ti - Tj;
Chris@42 239 Tk = Ti + Tj;
Chris@42 240 T1k = Im[WS(rs, 8)];
Chris@42 241 }
Chris@42 242 {
Chris@42 243 E TU, TV, Tq, Tr;
Chris@42 244 {
Chris@42 245 E Tn, T3o, T1l, To;
Chris@42 246 Tn = Rp[WS(rs, 8)];
Chris@42 247 T3O = T3M + T3N;
Chris@42 248 T4N = T3M - T3N;
Chris@42 249 TL = Th - Tk;
Chris@42 250 Tl = Th + Tk;
Chris@42 251 T3o = T1j + T1k;
Chris@42 252 T1l = T1j - T1k;
Chris@42 253 To = Rm[WS(rs, 1)];
Chris@42 254 TU = Ip[WS(rs, 8)];
Chris@42 255 T3p = T3n + T3o;
Chris@42 256 T4D = T3n - T3o;
Chris@42 257 T2z = T1i + T1l;
Chris@42 258 T1m = T1i - T1l;
Chris@42 259 T3r = Tn - To;
Chris@42 260 Tp = Tn + To;
Chris@42 261 TV = Im[WS(rs, 1)];
Chris@42 262 }
Chris@42 263 Tq = Rm[WS(rs, 6)];
Chris@42 264 Tr = Rp[WS(rs, 3)];
Chris@42 265 TX = Ip[WS(rs, 3)];
Chris@42 266 T3C = TU + TV;
Chris@42 267 TW = TU - TV;
Chris@42 268 T3D = Tq - Tr;
Chris@42 269 Ts = Tq + Tr;
Chris@42 270 TY = Im[WS(rs, 6)];
Chris@42 271 }
Chris@42 272 }
Chris@42 273 }
/* All loads are done; what follows combines the pairwise sums/differences */
/* into the intermediate terms used by the output stage. */
Chris@42 274 {
Chris@42 275 E T3E, Tt, T1A, T4E, T4H, T2J, T1B, T2I, TM, TP;
Chris@42 276 {
Chris@42 277 E T4P, TN, T3s, TZ;
Chris@42 278 T3q = T3m + T3p;
Chris@42 279 T43 = T3m - T3p;
Chris@42 280 T3E = T3C - T3D;
Chris@42 281 T4P = T3D + T3C;
Chris@42 282 TN = Tp - Ts;
Chris@42 283 Tt = Tp + Ts;
Chris@42 284 T3s = TX + TY;
Chris@42 285 TZ = TX - TY;
Chris@42 286 T1n = T1f - T1m;
Chris@42 287 T1A = T1f + T1m;
Chris@42 288 T4E = T4C + T4D;
Chris@42 289 T52 = T4C - T4D;
Chris@42 290 {
Chris@42 291 E T3t, T4F, T2B, T10;
Chris@42 292 T3t = T3r - T3s;
Chris@42 293 T4F = T3r + T3s;
Chris@42 294 T2B = TW + TZ;
Chris@42 295 T10 = TW - TZ;
Chris@42 296 T42 = T3t - T3w;
Chris@42 297 T3x = T3t + T3w;
Chris@42 298 T4H = T4F + T4G;
Chris@42 299 T53 = T4F - T4G;
Chris@42 300 T2D = T2B - T2C;
Chris@42 301 T2J = T2B + T2C;
Chris@42 302 T1B = T10 + T17;
Chris@42 303 T18 = T10 - T17;
Chris@42 304 T2A = T2y - T2z;
Chris@42 305 T2I = T2y + T2z;
Chris@42 306 TM = TK + TL;
Chris@42 307 T1H = TK - TL;
Chris@42 308 }
Chris@42 309 T4R = T4P - T4Q;
Chris@42 310 T4X = T4P + T4Q;
Chris@42 311 T4W = T4M + T4N;
Chris@42 312 T4O = T4M - T4N;
Chris@42 313 T1G = TN - TO;
Chris@42 314 TP = TN + TO;
Chris@42 315 }
Chris@42 316 {
Chris@42 317 E Tm, T3X, TB, T3W;
Chris@42 318 Tm = Te + Tl;
Chris@42 319 T2O = Te - Tl;
Chris@42 320 T3I = T3E + T3H;
Chris@42 321 T3X = T3E - T3H;
Chris@42 322 TB = Tt + TA;
Chris@42 323 T2P = Tt - TA;
Chris@42 324 T3P = T3L + T3O;
Chris@42 325 T3W = T3L - T3O;
Chris@42 326 T2K = T2I + T2J;
Chris@42 327 T2M = T2I - T2J;
Chris@42 328 T1C = T1A + T1B;
Chris@42 329 T1E = T1A - T1B;
Chris@42 330 TC = Tm + TB;
Chris@42 331 T2w = Tm - TB;
Chris@42 332 T40 = T3W - T3X;
Chris@42 333 T3Y = T3W + T3X;
Chris@42 334 T4K = T4E - T4H;
Chris@42 335 T4I = T4E + T4H;
Chris@42 336 TS = TM - TP;
Chris@42 337 TQ = TM + TP;
Chris@42 338 }
Chris@42 339 }
Chris@42 340 }
Chris@42 341 }
Chris@42 342 {
Chris@42 343 E T3A, T3y, T50, T1D, T2t, T2p, T4J, T5t, T5v, T4Z, T4Y;
/* Output phase: results are stored back into the same arrays.  Rp[0] and */
/* Rm[0] are plain sums with no twiddle-derived multiplier; every other */
/* output pair is formed with one FMA and one FNMS against a factor pair */
/* prepared in the twiddle setup above. */
Chris@42 344 Rp[0] = T7 + TC;
Chris@42 345 T3A = T3q - T3x;
Chris@42 346 T3y = T3q + T3x;
Chris@42 347 T50 = T4W - T4X;
Chris@42 348 T4Y = T4W + T4X;
Chris@42 349 Rm[0] = T2H + T2K;
Chris@42 350 T1D = FNMS(KP250000000, T1C, T1z);
Chris@42 351 T2t = T1z + T1C;
Chris@42 352 T2p = TJ + TQ;
Chris@42 353 TR = FNMS(KP250000000, TQ, TJ);
Chris@42 354 T4J = FNMS(KP250000000, T4I, T4B);
Chris@42 355 T5t = T4B + T4I;
Chris@42 356 T5v = T4V + T4Y;
Chris@42 357 T4Z = FNMS(KP250000000, T4Y, T4V);
Chris@42 358 {
Chris@42 359 E T4m, T44, T4i, T4p, T49, T3R, T4j, T4a, T3S, T4l, T41, T4q;
Chris@42 360 {
Chris@42 361 E T3z, T4v, T4w, T3Z, T4z;
Chris@42 362 T3z = FNMS(KP250000000, T3y, T3j);
Chris@42 363 T4v = T3j + T3y;
Chris@42 364 {
Chris@42 365 E T2u, T2q, T5u, T5w;
Chris@42 366 T2u = T2s * T2p;
Chris@42 367 T2q = T2o * T2p;
Chris@42 368 T5u = T2c * T5t;
Chris@42 369 T5w = T2c * T5v;
Chris@42 370 Rm[WS(rs, 5)] = FMA(T2o, T2t, T2u);
Chris@42 371 Rp[WS(rs, 5)] = FNMS(T2s, T2t, T2q);
Chris@42 372 Ip[WS(rs, 2)] = FNMS(T2f, T5v, T5u);
Chris@42 373 Im[WS(rs, 2)] = FMA(T2f, T5t, T5w);
Chris@42 374 T4w = T4u * T4v;
Chris@42 375 }
Chris@42 376 T3Z = FNMS(KP250000000, T3Y, T3V);
Chris@42 377 T4z = T3V + T3Y;
Chris@42 378 {
Chris@42 379 E T3Q, T4h, T4A, T4g, T3B;
Chris@42 380 T3Q = FNMS(KP618033988, T3P, T3I);
Chris@42 381 T4h = FMA(KP618033988, T3I, T3P);
Chris@42 382 Ip[WS(rs, 7)] = FNMS(T4y, T4z, T4w);
Chris@42 383 T4A = T4u * T4z;
Chris@42 384 T4m = FMA(KP618033988, T42, T43);
Chris@42 385 T44 = FNMS(KP618033988, T43, T42);
Chris@42 386 T4g = FMA(KP559016994, T3A, T3z);
Chris@42 387 T3B = FNMS(KP559016994, T3A, T3z);
Chris@42 388 Im[WS(rs, 7)] = FMA(T4y, T4v, T4A);
Chris@42 389 T4i = FNMS(KP951056516, T4h, T4g);
Chris@42 390 T4p = FMA(KP951056516, T4h, T4g);
Chris@42 391 T49 = FMA(KP951056516, T3Q, T3B);
Chris@42 392 T3R = FNMS(KP951056516, T3Q, T3B);
Chris@42 393 }
Chris@42 394 T4j = T4f * T4i;
Chris@42 395 T4a = T48 * T49;
Chris@42 396 T3S = TE * T3R;
Chris@42 397 T4l = FMA(KP559016994, T40, T3Z);
Chris@42 398 T41 = FNMS(KP559016994, T40, T3Z);
Chris@42 399 T4q = T1L * T4p;
Chris@42 400 }
Chris@42 401 {
Chris@42 402 E T5d, T4S, T54, T5i, T4L, T5c;
Chris@42 403 T5d = FNMS(KP618033988, T4O, T4R);
Chris@42 404 T4S = FMA(KP618033988, T4R, T4O);
Chris@42 405 {
Chris@42 406 E T4n, T4r, T4d, T45;
Chris@42 407 T4n = FMA(KP951056516, T4m, T4l);
Chris@42 408 T4r = FNMS(KP951056516, T4m, T4l);
Chris@42 409 T4d = FNMS(KP951056516, T44, T41);
Chris@42 410 T45 = FMA(KP951056516, T44, T41);
Chris@42 411 {
Chris@42 412 E T4o, T4s, T4e, T46;
Chris@42 413 T4o = T4f * T4n;
Chris@42 414 Ip[WS(rs, 5)] = FNMS(T4k, T4n, T4j);
Chris@42 415 T4s = T1L * T4r;
Chris@42 416 Ip[WS(rs, 9)] = FNMS(T1N, T4r, T4q);
Chris@42 417 T4e = T48 * T4d;
Chris@42 418 Ip[WS(rs, 3)] = FNMS(T4c, T4d, T4a);
Chris@42 419 T46 = TE * T45;
Chris@42 420 Ip[WS(rs, 1)] = FNMS(TH, T45, T3S);
Chris@42 421 Im[WS(rs, 5)] = FMA(T4k, T4i, T4o);
Chris@42 422 Im[WS(rs, 9)] = FMA(T1N, T4p, T4s);
Chris@42 423 Im[WS(rs, 3)] = FMA(T4c, T49, T4e);
Chris@42 424 Im[WS(rs, 1)] = FMA(TH, T3R, T46);
Chris@42 425 }
Chris@42 426 }
Chris@42 427 T54 = FMA(KP618033988, T53, T52);
Chris@42 428 T5i = FNMS(KP618033988, T52, T53);
Chris@42 429 T4L = FMA(KP559016994, T4K, T4J);
Chris@42 430 T5c = FNMS(KP559016994, T4K, T4J);
Chris@42 431 {
Chris@42 432 E T38, T2Q, T33, T2E, T2v, T37, T2N, T5h, T51, T2L, T2x, T32;
Chris@42 433 T38 = FNMS(KP618033988, T2O, T2P);
Chris@42 434 T2Q = FMA(KP618033988, T2P, T2O);
Chris@42 435 T5h = FNMS(KP559016994, T50, T4Z);
Chris@42 436 T51 = FMA(KP559016994, T50, T4Z);
Chris@42 437 {
Chris@42 438 E T5e, T5n, T57, T4T;
Chris@42 439 T5e = FNMS(KP951056516, T5d, T5c);
Chris@42 440 T5n = FMA(KP951056516, T5d, T5c);
Chris@42 441 T57 = FMA(KP951056516, T4S, T4L);
Chris@42 442 T4T = FNMS(KP951056516, T4S, T4L);
Chris@42 443 {
Chris@42 444 E T5j, T5r, T59, T55;
Chris@42 445 T5j = FMA(KP951056516, T5i, T5h);
Chris@42 446 T5r = FNMS(KP951056516, T5i, T5h);
Chris@42 447 T59 = FNMS(KP951056516, T54, T51);
Chris@42 448 T55 = FMA(KP951056516, T54, T51);
Chris@42 449 {
Chris@42 450 E T5f, T5o, T58, T4U;
Chris@42 451 T5f = T5b * T5e;
Chris@42 452 T5o = T5m * T5n;
Chris@42 453 T58 = T1V * T57;
Chris@42 454 T4U = TD * T4T;
Chris@42 455 {
Chris@42 456 E T5k, T5s, T5a, T56;
Chris@42 457 T5k = T5b * T5j;
Chris@42 458 T5s = T5m * T5r;
Chris@42 459 T5a = T1V * T59;
Chris@42 460 T56 = TD * T55;
Chris@42 461 Ip[WS(rs, 6)] = FNMS(T5g, T5j, T5f);
Chris@42 462 Ip[WS(rs, 8)] = FNMS(T5q, T5r, T5o);
Chris@42 463 Ip[WS(rs, 4)] = FNMS(T1X, T59, T58);
Chris@42 464 Ip[0] = FNMS(TG, T55, T4U);
Chris@42 465 Im[WS(rs, 6)] = FMA(T5g, T5e, T5k);
Chris@42 466 Im[WS(rs, 8)] = FMA(T5q, T5n, T5s);
Chris@42 467 Im[WS(rs, 4)] = FMA(T1X, T57, T5a);
Chris@42 468 Im[0] = FMA(TG, T4T, T56);
Chris@42 469 }
Chris@42 470 }
Chris@42 471 }
Chris@42 472 }
Chris@42 473 T2L = FNMS(KP250000000, T2K, T2H);
Chris@42 474 T33 = FNMS(KP618033988, T2A, T2D);
Chris@42 475 T2E = FMA(KP618033988, T2D, T2A);
Chris@42 476 T2v = FNMS(KP250000000, TC, T7);
Chris@42 477 T37 = FNMS(KP559016994, T2M, T2L);
Chris@42 478 T2N = FMA(KP559016994, T2M, T2L);
Chris@42 479 T1I = FNMS(KP618033988, T1H, T1G);
Chris@42 480 T26 = FMA(KP618033988, T1G, T1H);
Chris@42 481 T2x = FMA(KP559016994, T2w, T2v);
Chris@42 482 T32 = FNMS(KP559016994, T2w, T2v);
Chris@42 483 {
Chris@42 484 E T3f, T39, T2R, T2Z;
Chris@42 485 T3f = FNMS(KP951056516, T38, T37);
Chris@42 486 T39 = FMA(KP951056516, T38, T37);
Chris@42 487 T2R = FNMS(KP951056516, T2Q, T2N);
Chris@42 488 T2Z = FMA(KP951056516, T2Q, T2N);
Chris@42 489 {
Chris@42 490 E T3c, T34, T2F, T2V;
Chris@42 491 T3c = FMA(KP951056516, T33, T32);
Chris@42 492 T34 = FNMS(KP951056516, T33, T32);
Chris@42 493 T2F = FMA(KP951056516, T2E, T2x);
Chris@42 494 T2V = FNMS(KP951056516, T2E, T2x);
Chris@42 495 {
Chris@42 496 E T3a, T35, T3g, T3d;
Chris@42 497 T3a = T36 * T34;
Chris@42 498 T35 = T31 * T34;
Chris@42 499 T3g = T3e * T3c;
Chris@42 500 T3d = T3b * T3c;
Chris@42 501 {
Chris@42 502 E T30, T2W, T2S, T2G;
Chris@42 503 T30 = T2Y * T2V;
Chris@42 504 T2W = T2U * T2V;
Chris@42 505 T2S = T2b * T2F;
Chris@42 506 T2G = T29 * T2F;
Chris@42 507 Rm[WS(rs, 4)] = FMA(T31, T39, T3a);
Chris@42 508 Rp[WS(rs, 4)] = FNMS(T36, T39, T35);
Chris@42 509 Rm[WS(rs, 6)] = FMA(T3b, T3f, T3g);
Chris@42 510 Rp[WS(rs, 6)] = FNMS(T3e, T3f, T3d);
Chris@42 511 Rm[WS(rs, 8)] = FMA(T2U, T2Z, T30);
Chris@42 512 Rp[WS(rs, 8)] = FNMS(T2Y, T2Z, T2W);
Chris@42 513 Rm[WS(rs, 2)] = FMA(T29, T2R, T2S);
Chris@42 514 Rp[WS(rs, 2)] = FNMS(T2b, T2R, T2G);
Chris@42 515 }
Chris@42 516 }
Chris@42 517 }
Chris@42 518 }
Chris@42 519 T1o = FNMS(KP618033988, T1n, T18);
Chris@42 520 T20 = FMA(KP618033988, T18, T1n);
Chris@42 521 T1F = FNMS(KP559016994, T1E, T1D);
Chris@42 522 T25 = FMA(KP559016994, T1E, T1D);
Chris@42 523 }
Chris@42 524 }
Chris@42 525 }
Chris@42 526 }
Chris@42 527 }
Chris@42 528 }
/* Final group: the Rp/Rm outputs at offsets 1, 3, 7 and 9, built from the */
/* loop-body-scope temporaries set inside the nested blocks above. */
Chris@42 529 TT = FNMS(KP559016994, TS, TR);
Chris@42 530 T1Z = FMA(KP559016994, TS, TR);
Chris@42 531 {
Chris@42 532 E T2l, T27, T1J, T1T;
Chris@42 533 T2l = FNMS(KP951056516, T26, T25);
Chris@42 534 T27 = FMA(KP951056516, T26, T25);
Chris@42 535 T1J = FNMS(KP951056516, T1I, T1F);
Chris@42 536 T1T = FMA(KP951056516, T1I, T1F);
Chris@42 537 {
Chris@42 538 E T2h, T21, T1p, T1P;
Chris@42 539 T2h = FMA(KP951056516, T20, T1Z);
Chris@42 540 T21 = FNMS(KP951056516, T20, T1Z);
Chris@42 541 T1p = FMA(KP951056516, T1o, TT);
Chris@42 542 T1P = FNMS(KP951056516, T1o, TT);
Chris@42 543 {
Chris@42 544 E T28, T22, T2m, T2i;
Chris@42 545 T28 = T24 * T21;
Chris@42 546 T22 = T1Y * T21;
Chris@42 547 T2m = T2k * T2h;
Chris@42 548 T2i = T2g * T2h;
Chris@42 549 {
Chris@42 550 E T1U, T1Q, T1K, T1q;
Chris@42 551 T1U = T1S * T1P;
Chris@42 552 T1Q = T1O * T1P;
Chris@42 553 T1K = T1s * T1p;
Chris@42 554 T1q = TI * T1p;
Chris@42 555 Rm[WS(rs, 3)] = FMA(T1Y, T27, T28);
Chris@42 556 Rp[WS(rs, 3)] = FNMS(T24, T27, T22);
Chris@42 557 Rm[WS(rs, 7)] = FMA(T2g, T2l, T2m);
Chris@42 558 Rp[WS(rs, 7)] = FNMS(T2k, T2l, T2i);
Chris@42 559 Rm[WS(rs, 9)] = FMA(T1O, T1T, T1U);
Chris@42 560 Rp[WS(rs, 9)] = FNMS(T1S, T1T, T1Q);
Chris@42 561 Rm[WS(rs, 1)] = FMA(TI, T1J, T1K);
Chris@42 562 Rp[WS(rs, 1)] = FNMS(T1s, T1J, T1q);
Chris@42 563 }
Chris@42 564 }
Chris@42 565 }
Chris@42 566 }
Chris@42 567 }
Chris@42 568 }
Chris@42 569 }
Chris@42 570
/*
 * Twiddle instruction table referenced by the codelet descriptor: four
 * TW_CEXP entries (last field 1, 3, 9 and 19) followed by a TW_NEXT
 * terminator entry.
 * NOTE(review): presumably each TW_CEXP entry yields one complex value, i.e.
 * the eight reals the kernel reads as W[0..7] per iteration -- confirm
 * against the tw_instr definition in the project headers.
 */
Chris@42 571 static const tw_instr twinstr[] = {
Chris@42 572 {TW_CEXP, 1, 1},
Chris@42 573 {TW_CEXP, 1, 3},
Chris@42 574 {TW_CEXP, 1, 9},
Chris@42 575 {TW_CEXP, 1, 19},
Chris@42 576 {TW_NEXT, 1, 0}
Chris@42 577 };
Chris@42 578
/*
 * Codelet descriptor for the FMA build: size 20, the codelet name, the
 * twiddle table above, the GENUS object from the included header, and the
 * operation counts {136, 58, 140, 0}.  The first three counts match the
 * adds / multiplies / fused multiply-adds quoted in the generator's header
 * comment for this variant.
 */
Chris@42 579 static const hc2c_desc desc = { 20, "hc2cb2_20", twinstr, &GENUS, {136, 58, 140, 0} };
Chris@42 580
/*
 * Registration entry point for this codelet: passes the hc2cb2_20 kernel and
 * its descriptor to planner p through X(khc2c_register), tagged with
 * HC2C_VIA_RDFT.
 * NOTE(review): X() is a project macro not visible here; presumably it
 * applies the library's symbol-name prefix -- confirm in the headers.
 */
Chris@42 581 void X(codelet_hc2cb2_20) (planner *p) {
Chris@42 582 X(khc2c_register) (p, hc2cb2_20, &desc, HC2C_VIA_RDFT);
Chris@42 583 }
Chris@42 584 #else /* HAVE_FMA */
Chris@42 585
Chris@42 586 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 20 -dif -name hc2cb2_20 -include hc2cb.h */
Chris@42 587
Chris@42 588 /*
Chris@42 589 * This function contains 276 FP additions, 164 FP multiplications,
Chris@42 590 * (or, 204 additions, 92 multiplications, 72 fused multiply/add),
Chris@42 591 * 137 stack variables, 4 constants, and 80 memory accesses
Chris@42 592 */
Chris@42 593 #include "hc2cb.h"
Chris@42 594
Chris@42 595 static void hc2cb2_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 596 {
Chris@42 597 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 598 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 599 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 600 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 601 {
Chris@42 602 INT m;
Chris@42 603 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@42 604 E TD, TG, TE, TH, TJ, T1t, T27, T25, T1T, T1R, T1V, T2j, T2Z, T21, T2X;
Chris@42 605 E T2T, T2n, T2P, T3V, T41, T3R, T3X, T29, T2c, T4H, T4L, T1L, T1M, T1N, T2d;
Chris@42 606 E T4R, T1P, T4P, T49, T2N, T2f, T47, T2L;
Chris@42 607 {
Chris@42 608 E T1U, T2l, T1Z, T2i, T1S, T2m, T20, T2h;
Chris@42 609 {
Chris@42 610 E TF, T1s, TI, T1r;
Chris@42 611 TD = W[0];
Chris@42 612 TG = W[1];
Chris@42 613 TE = W[2];
Chris@42 614 TH = W[3];
Chris@42 615 TF = TD * TE;
Chris@42 616 T1s = TG * TE;
Chris@42 617 TI = TG * TH;
Chris@42 618 T1r = TD * TH;
Chris@42 619 TJ = TF + TI;
Chris@42 620 T1t = T1r - T1s;
Chris@42 621 T27 = T1r + T1s;
Chris@42 622 T25 = TF - TI;
Chris@42 623 T1T = W[5];
Chris@42 624 T1U = TH * T1T;
Chris@42 625 T2l = TD * T1T;
Chris@42 626 T1Z = TE * T1T;
Chris@42 627 T2i = TG * T1T;
Chris@42 628 T1R = W[4];
Chris@42 629 T1S = TE * T1R;
Chris@42 630 T2m = TG * T1R;
Chris@42 631 T20 = TH * T1R;
Chris@42 632 T2h = TD * T1R;
Chris@42 633 }
Chris@42 634 T1V = T1S + T1U;
Chris@42 635 T2j = T2h - T2i;
Chris@42 636 T2Z = T1Z + T20;
Chris@42 637 T21 = T1Z - T20;
Chris@42 638 T2X = T1S - T1U;
Chris@42 639 T2T = T2l - T2m;
Chris@42 640 T2n = T2l + T2m;
Chris@42 641 T2P = T2h + T2i;
Chris@42 642 {
Chris@42 643 E T3T, T3U, T3P, T3Q;
Chris@42 644 T3T = TJ * T1T;
Chris@42 645 T3U = T1t * T1R;
Chris@42 646 T3V = T3T - T3U;
Chris@42 647 T41 = T3T + T3U;
Chris@42 648 T3P = TJ * T1R;
Chris@42 649 T3Q = T1t * T1T;
Chris@42 650 T3R = T3P + T3Q;
Chris@42 651 T3X = T3P - T3Q;
Chris@42 652 {
Chris@42 653 E T26, T28, T2a, T2b;
Chris@42 654 T26 = T25 * T1R;
Chris@42 655 T28 = T27 * T1T;
Chris@42 656 T29 = T26 + T28;
Chris@42 657 T2a = T25 * T1T;
Chris@42 658 T2b = T27 * T1R;
Chris@42 659 T2c = T2a - T2b;
Chris@42 660 T4H = T26 - T28;
Chris@42 661 T4L = T2a + T2b;
Chris@42 662 T1L = W[6];
Chris@42 663 T1M = W[7];
Chris@42 664 T1N = FMA(TD, T1L, TG * T1M);
Chris@42 665 T2d = FMA(T29, T1L, T2c * T1M);
Chris@42 666 T4R = FNMS(T1t, T1L, TJ * T1M);
Chris@42 667 T1P = FNMS(TG, T1L, TD * T1M);
Chris@42 668 T4P = FMA(TJ, T1L, T1t * T1M);
Chris@42 669 T49 = FNMS(T27, T1L, T25 * T1M);
Chris@42 670 T2N = FNMS(TH, T1L, TE * T1M);
Chris@42 671 T2f = FNMS(T2c, T1L, T29 * T1M);
Chris@42 672 T47 = FMA(T25, T1L, T27 * T1M);
Chris@42 673 T2L = FMA(TE, T1L, TH * T1M);
Chris@42 674 }
Chris@42 675 }
Chris@42 676 }
Chris@42 677 {
Chris@42 678 E T7, T4i, T4x, TK, T1D, T3i, T3E, T2D, T19, T3L, T3M, T1o, T2x, T4C, T4B;
Chris@42 679 E T2u, T1v, T4r, T4o, T1u, T2H, T37, T2I, T3e, T3p, T3w, T3x, Tm, TB, TC;
Chris@42 680 E T4u, T4v, T4y, T2A, T2B, T2E, T1E, T1F, T1G, T4d, T4g, T4j, T3F, T3G, T3H;
Chris@42 681 E TN, TQ, TR, T48, T4a;
Chris@42 682 {
Chris@42 683 E T3, T3g, T1z, T3C, T6, T3D, T1C, T3h;
Chris@42 684 {
Chris@42 685 E T1, T2, T1x, T1y;
Chris@42 686 T1 = Rp[0];
Chris@42 687 T2 = Rm[WS(rs, 9)];
Chris@42 688 T3 = T1 + T2;
Chris@42 689 T3g = T1 - T2;
Chris@42 690 T1x = Ip[0];
Chris@42 691 T1y = Im[WS(rs, 9)];
Chris@42 692 T1z = T1x - T1y;
Chris@42 693 T3C = T1x + T1y;
Chris@42 694 }
Chris@42 695 {
Chris@42 696 E T4, T5, T1A, T1B;
Chris@42 697 T4 = Rp[WS(rs, 5)];
Chris@42 698 T5 = Rm[WS(rs, 4)];
Chris@42 699 T6 = T4 + T5;
Chris@42 700 T3D = T4 - T5;
Chris@42 701 T1A = Ip[WS(rs, 5)];
Chris@42 702 T1B = Im[WS(rs, 4)];
Chris@42 703 T1C = T1A - T1B;
Chris@42 704 T3h = T1A + T1B;
Chris@42 705 }
Chris@42 706 T7 = T3 + T6;
Chris@42 707 T4i = T3g - T3h;
Chris@42 708 T4x = T3D + T3C;
Chris@42 709 TK = T3 - T6;
Chris@42 710 T1D = T1z - T1C;
Chris@42 711 T3i = T3g + T3h;
Chris@42 712 T3E = T3C - T3D;
Chris@42 713 T2D = T1z + T1C;
Chris@42 714 }
Chris@42 715 {
Chris@42 716 E Te, T4b, T4m, TL, T11, T33, T3l, T2s, TA, T4f, T4q, TP, T1n, T3d, T3v;
Chris@42 717 E T2w, Tl, T4c, T4n, TM, T18, T36, T3o, T2t, Tt, T4e, T4p, TO, T1g, T3a;
Chris@42 718 E T3s, T2v;
Chris@42 719 {
Chris@42 720 E Ta, T3j, TX, T31, Td, T32, T10, T3k;
Chris@42 721 {
Chris@42 722 E T8, T9, TV, TW;
Chris@42 723 T8 = Rp[WS(rs, 4)];
Chris@42 724 T9 = Rm[WS(rs, 5)];
Chris@42 725 Ta = T8 + T9;
Chris@42 726 T3j = T8 - T9;
Chris@42 727 TV = Ip[WS(rs, 4)];
Chris@42 728 TW = Im[WS(rs, 5)];
Chris@42 729 TX = TV - TW;
Chris@42 730 T31 = TV + TW;
Chris@42 731 }
Chris@42 732 {
Chris@42 733 E Tb, Tc, TY, TZ;
Chris@42 734 Tb = Rp[WS(rs, 9)];
Chris@42 735 Tc = Rm[0];
Chris@42 736 Td = Tb + Tc;
Chris@42 737 T32 = Tb - Tc;
Chris@42 738 TY = Ip[WS(rs, 9)];
Chris@42 739 TZ = Im[0];
Chris@42 740 T10 = TY - TZ;
Chris@42 741 T3k = TY + TZ;
Chris@42 742 }
Chris@42 743 Te = Ta + Td;
Chris@42 744 T4b = T3j - T3k;
Chris@42 745 T4m = T32 + T31;
Chris@42 746 TL = Ta - Td;
Chris@42 747 T11 = TX - T10;
Chris@42 748 T33 = T31 - T32;
Chris@42 749 T3l = T3j + T3k;
Chris@42 750 T2s = TX + T10;
Chris@42 751 }
Chris@42 752 {
Chris@42 753 E Tw, T3t, T1j, T3c, Tz, T3b, T1m, T3u;
Chris@42 754 {
Chris@42 755 E Tu, Tv, T1h, T1i;
Chris@42 756 Tu = Rm[WS(rs, 7)];
Chris@42 757 Tv = Rp[WS(rs, 2)];
Chris@42 758 Tw = Tu + Tv;
Chris@42 759 T3t = Tu - Tv;
Chris@42 760 T1h = Ip[WS(rs, 2)];
Chris@42 761 T1i = Im[WS(rs, 7)];
Chris@42 762 T1j = T1h - T1i;
Chris@42 763 T3c = T1h + T1i;
Chris@42 764 }
Chris@42 765 {
Chris@42 766 E Tx, Ty, T1k, T1l;
Chris@42 767 Tx = Rm[WS(rs, 2)];
Chris@42 768 Ty = Rp[WS(rs, 7)];
Chris@42 769 Tz = Tx + Ty;
Chris@42 770 T3b = Tx - Ty;
Chris@42 771 T1k = Ip[WS(rs, 7)];
Chris@42 772 T1l = Im[WS(rs, 2)];
Chris@42 773 T1m = T1k - T1l;
Chris@42 774 T3u = T1k + T1l;
Chris@42 775 }
Chris@42 776 TA = Tw + Tz;
Chris@42 777 T4f = T3t + T3u;
Chris@42 778 T4q = T3b - T3c;
Chris@42 779 TP = Tw - Tz;
Chris@42 780 T1n = T1j - T1m;
Chris@42 781 T3d = T3b + T3c;
Chris@42 782 T3v = T3t - T3u;
Chris@42 783 T2w = T1j + T1m;
Chris@42 784 }
Chris@42 785 {
Chris@42 786 E Th, T3m, T14, T35, Tk, T34, T17, T3n;
Chris@42 787 {
Chris@42 788 E Tf, Tg, T12, T13;
Chris@42 789 Tf = Rm[WS(rs, 3)];
Chris@42 790 Tg = Rp[WS(rs, 6)];
Chris@42 791 Th = Tf + Tg;
Chris@42 792 T3m = Tf - Tg;
Chris@42 793 T12 = Ip[WS(rs, 6)];
Chris@42 794 T13 = Im[WS(rs, 3)];
Chris@42 795 T14 = T12 - T13;
Chris@42 796 T35 = T12 + T13;
Chris@42 797 }
Chris@42 798 {
Chris@42 799 E Ti, Tj, T15, T16;
Chris@42 800 Ti = Rp[WS(rs, 1)];
Chris@42 801 Tj = Rm[WS(rs, 8)];
Chris@42 802 Tk = Ti + Tj;
Chris@42 803 T34 = Ti - Tj;
Chris@42 804 T15 = Ip[WS(rs, 1)];
Chris@42 805 T16 = Im[WS(rs, 8)];
Chris@42 806 T17 = T15 - T16;
Chris@42 807 T3n = T15 + T16;
Chris@42 808 }
Chris@42 809 Tl = Th + Tk;
Chris@42 810 T4c = T3m - T3n;
Chris@42 811 T4n = T34 - T35;
Chris@42 812 TM = Th - Tk;
Chris@42 813 T18 = T14 - T17;
Chris@42 814 T36 = T34 + T35;
Chris@42 815 T3o = T3m + T3n;
Chris@42 816 T2t = T14 + T17;
Chris@42 817 }
Chris@42 818 {
Chris@42 819 E Tp, T3q, T1c, T38, Ts, T39, T1f, T3r;
Chris@42 820 {
Chris@42 821 E Tn, To, T1a, T1b;
Chris@42 822 Tn = Rp[WS(rs, 8)];
Chris@42 823 To = Rm[WS(rs, 1)];
Chris@42 824 Tp = Tn + To;
Chris@42 825 T3q = Tn - To;
Chris@42 826 T1a = Ip[WS(rs, 8)];
Chris@42 827 T1b = Im[WS(rs, 1)];
Chris@42 828 T1c = T1a - T1b;
Chris@42 829 T38 = T1a + T1b;
Chris@42 830 }
Chris@42 831 {
Chris@42 832 E Tq, Tr, T1d, T1e;
Chris@42 833 Tq = Rm[WS(rs, 6)];
Chris@42 834 Tr = Rp[WS(rs, 3)];
Chris@42 835 Ts = Tq + Tr;
Chris@42 836 T39 = Tq - Tr;
Chris@42 837 T1d = Ip[WS(rs, 3)];
Chris@42 838 T1e = Im[WS(rs, 6)];
Chris@42 839 T1f = T1d - T1e;
Chris@42 840 T3r = T1d + T1e;
Chris@42 841 }
Chris@42 842 Tt = Tp + Ts;
Chris@42 843 T4e = T3q + T3r;
Chris@42 844 T4p = T39 + T38;
Chris@42 845 TO = Tp - Ts;
Chris@42 846 T1g = T1c - T1f;
Chris@42 847 T3a = T38 - T39;
Chris@42 848 T3s = T3q - T3r;
Chris@42 849 T2v = T1c + T1f;
Chris@42 850 }
Chris@42 851 T19 = T11 - T18;
Chris@42 852 T3L = T3l - T3o;
Chris@42 853 T3M = T3s - T3v;
Chris@42 854 T1o = T1g - T1n;
Chris@42 855 T2x = T2v - T2w;
Chris@42 856 T4C = T4e - T4f;
Chris@42 857 T4B = T4b - T4c;
Chris@42 858 T2u = T2s - T2t;
Chris@42 859 T1v = TO - TP;
Chris@42 860 T4r = T4p - T4q;
Chris@42 861 T4o = T4m - T4n;
Chris@42 862 T1u = TL - TM;
Chris@42 863 T2H = Te - Tl;
Chris@42 864 T37 = T33 + T36;
Chris@42 865 T2I = Tt - TA;
Chris@42 866 T3e = T3a + T3d;
Chris@42 867 T3p = T3l + T3o;
Chris@42 868 T3w = T3s + T3v;
Chris@42 869 T3x = T3p + T3w;
Chris@42 870 Tm = Te + Tl;
Chris@42 871 TB = Tt + TA;
Chris@42 872 TC = Tm + TB;
Chris@42 873 T4u = T4m + T4n;
Chris@42 874 T4v = T4p + T4q;
Chris@42 875 T4y = T4u + T4v;
Chris@42 876 T2A = T2s + T2t;
Chris@42 877 T2B = T2v + T2w;
Chris@42 878 T2E = T2A + T2B;
Chris@42 879 T1E = T11 + T18;
Chris@42 880 T1F = T1g + T1n;
Chris@42 881 T1G = T1E + T1F;
Chris@42 882 T4d = T4b + T4c;
Chris@42 883 T4g = T4e + T4f;
Chris@42 884 T4j = T4d + T4g;
Chris@42 885 T3F = T33 - T36;
Chris@42 886 T3G = T3a - T3d;
Chris@42 887 T3H = T3F + T3G;
Chris@42 888 TN = TL + TM;
Chris@42 889 TQ = TO + TP;
Chris@42 890 TR = TN + TQ;
Chris@42 891 }
Chris@42 892 Rp[0] = T7 + TC;
Chris@42 893 Rm[0] = T2D + T2E;
Chris@42 894 {
Chris@42 895 E T2k, T2o, T4T, T4U;
Chris@42 896 T2k = TK + TR;
Chris@42 897 T2o = T1D + T1G;
Chris@42 898 Rp[WS(rs, 5)] = FNMS(T2n, T2o, T2j * T2k);
Chris@42 899 Rm[WS(rs, 5)] = FMA(T2n, T2k, T2j * T2o);
Chris@42 900 T4T = T4i + T4j;
Chris@42 901 T4U = T4x + T4y;
Chris@42 902 Ip[WS(rs, 2)] = FNMS(T2c, T4U, T29 * T4T);
Chris@42 903 Im[WS(rs, 2)] = FMA(T29, T4U, T2c * T4T);
Chris@42 904 }
Chris@42 905 T48 = T3i + T3x;
Chris@42 906 T4a = T3E + T3H;
Chris@42 907 Ip[WS(rs, 7)] = FNMS(T49, T4a, T47 * T48);
Chris@42 908 Im[WS(rs, 7)] = FMA(T47, T4a, T49 * T48);
Chris@42 909 {
Chris@42 910 E T2y, T2J, T2V, T2R, T2G, T2U, T2r, T2Q;
Chris@42 911 T2y = FMA(KP951056516, T2u, KP587785252 * T2x);
Chris@42 912 T2J = FMA(KP951056516, T2H, KP587785252 * T2I);
Chris@42 913 T2V = FNMS(KP951056516, T2I, KP587785252 * T2H);
Chris@42 914 T2R = FNMS(KP951056516, T2x, KP587785252 * T2u);
Chris@42 915 {
Chris@42 916 E T2C, T2F, T2p, T2q;
Chris@42 917 T2C = KP559016994 * (T2A - T2B);
Chris@42 918 T2F = FNMS(KP250000000, T2E, T2D);
Chris@42 919 T2G = T2C + T2F;
Chris@42 920 T2U = T2F - T2C;
Chris@42 921 T2p = KP559016994 * (Tm - TB);
Chris@42 922 T2q = FNMS(KP250000000, TC, T7);
Chris@42 923 T2r = T2p + T2q;
Chris@42 924 T2Q = T2q - T2p;
Chris@42 925 }
Chris@42 926 {
Chris@42 927 E T2z, T2K, T2Y, T30;
Chris@42 928 T2z = T2r + T2y;
Chris@42 929 T2K = T2G - T2J;
Chris@42 930 Rp[WS(rs, 2)] = FNMS(T27, T2K, T25 * T2z);
Chris@42 931 Rm[WS(rs, 2)] = FMA(T27, T2z, T25 * T2K);
Chris@42 932 T2Y = T2Q - T2R;
Chris@42 933 T30 = T2V + T2U;
Chris@42 934 Rp[WS(rs, 6)] = FNMS(T2Z, T30, T2X * T2Y);
Chris@42 935 Rm[WS(rs, 6)] = FMA(T2Z, T2Y, T2X * T30);
Chris@42 936 }
Chris@42 937 {
Chris@42 938 E T2M, T2O, T2S, T2W;
Chris@42 939 T2M = T2r - T2y;
Chris@42 940 T2O = T2J + T2G;
Chris@42 941 Rp[WS(rs, 8)] = FNMS(T2N, T2O, T2L * T2M);
Chris@42 942 Rm[WS(rs, 8)] = FMA(T2N, T2M, T2L * T2O);
Chris@42 943 T2S = T2Q + T2R;
Chris@42 944 T2W = T2U - T2V;
Chris@42 945 Rp[WS(rs, 4)] = FNMS(T2T, T2W, T2P * T2S);
Chris@42 946 Rm[WS(rs, 4)] = FMA(T2T, T2S, T2P * T2W);
Chris@42 947 }
Chris@42 948 }
Chris@42 949 {
Chris@42 950 E T4s, T4D, T4N, T4I, T4A, T4M, T4l, T4J;
Chris@42 951 T4s = FMA(KP951056516, T4o, KP587785252 * T4r);
Chris@42 952 T4D = FMA(KP951056516, T4B, KP587785252 * T4C);
Chris@42 953 T4N = FNMS(KP951056516, T4C, KP587785252 * T4B);
Chris@42 954 T4I = FNMS(KP951056516, T4r, KP587785252 * T4o);
Chris@42 955 {
Chris@42 956 E T4w, T4z, T4h, T4k;
Chris@42 957 T4w = KP559016994 * (T4u - T4v);
Chris@42 958 T4z = FNMS(KP250000000, T4y, T4x);
Chris@42 959 T4A = T4w + T4z;
Chris@42 960 T4M = T4z - T4w;
Chris@42 961 T4h = KP559016994 * (T4d - T4g);
Chris@42 962 T4k = FNMS(KP250000000, T4j, T4i);
Chris@42 963 T4l = T4h + T4k;
Chris@42 964 T4J = T4k - T4h;
Chris@42 965 }
Chris@42 966 {
Chris@42 967 E T4t, T4E, T4Q, T4S;
Chris@42 968 T4t = T4l - T4s;
Chris@42 969 T4E = T4A + T4D;
Chris@42 970 Ip[0] = FNMS(TG, T4E, TD * T4t);
Chris@42 971 Im[0] = FMA(TD, T4E, TG * T4t);
Chris@42 972 T4Q = T4J - T4I;
Chris@42 973 T4S = T4M + T4N;
Chris@42 974 Ip[WS(rs, 8)] = FNMS(T4R, T4S, T4P * T4Q);
Chris@42 975 Im[WS(rs, 8)] = FMA(T4P, T4S, T4R * T4Q);
Chris@42 976 }
Chris@42 977 {
Chris@42 978 E T4F, T4G, T4K, T4O;
Chris@42 979 T4F = T4s + T4l;
Chris@42 980 T4G = T4A - T4D;
Chris@42 981 Ip[WS(rs, 4)] = FNMS(T1T, T4G, T1R * T4F);
Chris@42 982 Im[WS(rs, 4)] = FMA(T1R, T4G, T1T * T4F);
Chris@42 983 T4K = T4I + T4J;
Chris@42 984 T4O = T4M - T4N;
Chris@42 985 Ip[WS(rs, 6)] = FNMS(T4L, T4O, T4H * T4K);
Chris@42 986 Im[WS(rs, 6)] = FMA(T4H, T4O, T4L * T4K);
Chris@42 987 }
Chris@42 988 }
Chris@42 989 {
Chris@42 990 E T1p, T1w, T22, T1X, T1J, T23, TU, T1W;
Chris@42 991 T1p = FNMS(KP951056516, T1o, KP587785252 * T19);
Chris@42 992 T1w = FNMS(KP951056516, T1v, KP587785252 * T1u);
Chris@42 993 T22 = FMA(KP951056516, T1u, KP587785252 * T1v);
Chris@42 994 T1X = FMA(KP951056516, T19, KP587785252 * T1o);
Chris@42 995 {
Chris@42 996 E T1H, T1I, TS, TT;
Chris@42 997 T1H = FNMS(KP250000000, T1G, T1D);
Chris@42 998 T1I = KP559016994 * (T1E - T1F);
Chris@42 999 T1J = T1H - T1I;
Chris@42 1000 T23 = T1I + T1H;
Chris@42 1001 TS = FNMS(KP250000000, TR, TK);
Chris@42 1002 TT = KP559016994 * (TN - TQ);
Chris@42 1003 TU = TS - TT;
Chris@42 1004 T1W = TT + TS;
Chris@42 1005 }
Chris@42 1006 {
Chris@42 1007 E T1q, T1K, T2e, T2g;
Chris@42 1008 T1q = TU - T1p;
Chris@42 1009 T1K = T1w + T1J;
Chris@42 1010 Rp[WS(rs, 1)] = FNMS(T1t, T1K, TJ * T1q);
Chris@42 1011 Rm[WS(rs, 1)] = FMA(T1t, T1q, TJ * T1K);
Chris@42 1012 T2e = T1W + T1X;
Chris@42 1013 T2g = T23 - T22;
Chris@42 1014 Rp[WS(rs, 7)] = FNMS(T2f, T2g, T2d * T2e);
Chris@42 1015 Rm[WS(rs, 7)] = FMA(T2f, T2e, T2d * T2g);
Chris@42 1016 }
Chris@42 1017 {
Chris@42 1018 E T1O, T1Q, T1Y, T24;
Chris@42 1019 T1O = TU + T1p;
Chris@42 1020 T1Q = T1J - T1w;
Chris@42 1021 Rp[WS(rs, 9)] = FNMS(T1P, T1Q, T1N * T1O);
Chris@42 1022 Rm[WS(rs, 9)] = FMA(T1P, T1O, T1N * T1Q);
Chris@42 1023 T1Y = T1W - T1X;
Chris@42 1024 T24 = T22 + T23;
Chris@42 1025 Rp[WS(rs, 3)] = FNMS(T21, T24, T1V * T1Y);
Chris@42 1026 Rm[WS(rs, 3)] = FMA(T21, T1Y, T1V * T24);
Chris@42 1027 }
Chris@42 1028 }
Chris@42 1029 {
Chris@42 1030 E T3f, T3N, T43, T3Z, T3K, T42, T3A, T3Y;
Chris@42 1031 T3f = FNMS(KP951056516, T3e, KP587785252 * T37);
Chris@42 1032 T3N = FNMS(KP951056516, T3M, KP587785252 * T3L);
Chris@42 1033 T43 = FMA(KP951056516, T3L, KP587785252 * T3M);
Chris@42 1034 T3Z = FMA(KP951056516, T37, KP587785252 * T3e);
Chris@42 1035 {
Chris@42 1036 E T3I, T3J, T3y, T3z;
Chris@42 1037 T3I = FNMS(KP250000000, T3H, T3E);
Chris@42 1038 T3J = KP559016994 * (T3F - T3G);
Chris@42 1039 T3K = T3I - T3J;
Chris@42 1040 T42 = T3J + T3I;
Chris@42 1041 T3y = FNMS(KP250000000, T3x, T3i);
Chris@42 1042 T3z = KP559016994 * (T3p - T3w);
Chris@42 1043 T3A = T3y - T3z;
Chris@42 1044 T3Y = T3z + T3y;
Chris@42 1045 }
Chris@42 1046 {
Chris@42 1047 E T3B, T3O, T45, T46;
Chris@42 1048 T3B = T3f + T3A;
Chris@42 1049 T3O = T3K - T3N;
Chris@42 1050 Ip[WS(rs, 1)] = FNMS(TH, T3O, TE * T3B);
Chris@42 1051 Im[WS(rs, 1)] = FMA(TE, T3O, TH * T3B);
Chris@42 1052 T45 = T3Z + T3Y;
Chris@42 1053 T46 = T42 - T43;
Chris@42 1054 Ip[WS(rs, 9)] = FNMS(T1M, T46, T1L * T45);
Chris@42 1055 Im[WS(rs, 9)] = FMA(T1L, T46, T1M * T45);
Chris@42 1056 }
Chris@42 1057 {
Chris@42 1058 E T3S, T3W, T40, T44;
Chris@42 1059 T3S = T3A - T3f;
Chris@42 1060 T3W = T3K + T3N;
Chris@42 1061 Ip[WS(rs, 3)] = FNMS(T3V, T3W, T3R * T3S);
Chris@42 1062 Im[WS(rs, 3)] = FMA(T3R, T3W, T3V * T3S);
Chris@42 1063 T40 = T3Y - T3Z;
Chris@42 1064 T44 = T42 + T43;
Chris@42 1065 Ip[WS(rs, 5)] = FNMS(T41, T44, T3X * T40);
Chris@42 1066 Im[WS(rs, 5)] = FMA(T3X, T44, T41 * T40);
Chris@42 1067 }
Chris@42 1068 }
Chris@42 1069 }
Chris@42 1070 }
Chris@42 1071 }
Chris@42 1072 }
Chris@42 1073
/*
 * Twiddle-instruction table for this codelet; it is referenced by the
 * `desc` descriptor later in this file.
 *
 * The table holds four TW_CEXP entries whose last fields are 1, 3, 9 and 19,
 * followed by a single TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry requests one complex-exponential
 * twiddle factor at the given index and TW_NEXT terminates the list; the
 * tw_instr layout is declared outside this file -- confirm against its
 * definition before relying on this.
 *
 * This file is machine-generated (see the "DO NOT EDIT" banner at the top),
 * so changes belong in the generator rather than here.
 */
Chris@42 1074 static const tw_instr twinstr[] = {
Chris@42 1075 {TW_CEXP, 1, 1},
Chris@42 1076 {TW_CEXP, 1, 3},
Chris@42 1077 {TW_CEXP, 1, 9},
Chris@42 1078 {TW_CEXP, 1, 19},
Chris@42 1079 {TW_NEXT, 1, 0}
Chris@42 1080 };
Chris@42 1081
/*
 * Descriptor handed to the planner when this codelet is registered.
 * Fields, in order: the integer 20, the codelet name string "hc2cb2_20",
 * the twiddle-instruction table `twinstr`, the address of GENUS (defined
 * outside this file), and the four-number tuple {204, 92, 72, 0}.
 * NOTE(review): presumably 20 is the transform size and the tuple holds
 * operation counts (additions, multiplications, fused multiply-adds, other);
 * the hc2c_desc layout is not visible here -- confirm against its definition.
 */
Chris@42 1082 static const hc2c_desc desc = { 20, "hc2cb2_20", twinstr, &GENUS, {204, 92, 72, 0} };
Chris@42 1083
/*
 * Registration entry point for this codelet.
 *
 * Passes the planner `p`, the hc2cb2_20 kernel function, the address of its
 * descriptor `desc`, and the HC2C_VIA_RDFT constant to X(khc2c_register).
 * Returns nothing.
 *
 * NOTE(review): X() is a macro defined outside this file; presumably it
 * applies the library's precision-specific name prefix -- confirm.
 */
Chris@42 1084 void X(codelet_hc2cb2_20) (planner *p) {
Chris@42 1085 X(khc2c_register) (p, hc2cb2_20, &desc, HC2C_VIA_RDFT);
Chris@42 1086 }
Chris@42 1087 #endif /* HAVE_FMA */