annotate src/fftw-3.3.5/rdft/scalar/r2cf/hc2cfdft_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:48:42 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cfdft_12 -include hc2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 142 FP additions, 92 FP multiplications,
Chris@42 32 * (or, 96 additions, 46 multiplications, 46 fused multiply/add),
Chris@42 33 * 71 stack variables, 2 constants, and 48 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cf.h"
Chris@42 36
Chris@42 37 static void hc2cfdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) /* Machine-generated size-12 kernel (FMA build; see the genfft command line above). For each m in [mb, me) it loads elements 0 and WS(rs, 1..5) of Rp, Ip, Rm and Im, combines them with W[0..21], and stores 24 results back into the same locations. rs is used only through WS()/MAKE_VOLATILE_STRIDE; ms is the per-iteration pointer step. */
Chris@42 38 {
Chris@42 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* numerically sqrt(3)/2 */
Chris@42 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2; also the scale applied to every stored output below */
Chris@42 41 {
Chris@42 42 INT m;
Chris@42 43 for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) { /* W starts (mb - 1) * 22 values in; each iteration steps Rp/Ip forward and Rm/Im backward by ms and advances W by 22 values */
Chris@42 44 E T2z, T2M; /* declared at loop scope because the last two stores, after the nested blocks close, still need them */
Chris@42 45 {
Chris@42 46 E To, T1E, T2H, T1m, T1W, Tl, T1J, T2i, T2K, T1B, T2I, T2e, T19, T2E, T2C;
Chris@42 47 E T27, T1M, Tz, T2B, T1f, T1O, TJ, TT, T1Q;
Chris@42 48 {
Chris@42 49 E T2b, T1s, T1A, T2d;
Chris@42 50 {
Chris@42 51 E T1u, T1z, T1v, T2c, T1i, Te, T1l, Tj, Tf, T1H, T4, T1o, T1, T1r, T9;
Chris@42 52 E T1n, T5;
Chris@42 53 {
Chris@42 54 E T1x, T1y, T1t, Tm, Tn;
Chris@42 55 Tm = Ip[0]; /* load phase: inputs are read, summed/differenced in Ip/Im and Rp/Rm pairs, and multiplied by values from W; all loads in this iteration precede the first store */
Chris@42 56 Tn = Im[0];
Chris@42 57 T1x = Rp[0];
Chris@42 58 T1y = Rm[0];
Chris@42 59 T1t = W[0];
Chris@42 60 T1u = Tm + Tn;
Chris@42 61 To = Tm - Tn;
Chris@42 62 {
Chris@42 63 E Th, Ti, Tb, Tc, Td;
Chris@42 64 Tc = Ip[WS(rs, 4)];
Chris@42 65 T1z = T1x - T1y;
Chris@42 66 T1E = T1x + T1y;
Chris@42 67 Td = Im[WS(rs, 4)];
Chris@42 68 T1v = T1t * T1u;
Chris@42 69 Th = Rp[WS(rs, 4)];
Chris@42 70 T2c = T1t * T1z;
Chris@42 71 T1i = Tc + Td;
Chris@42 72 Te = Tc - Td;
Chris@42 73 Ti = Rm[WS(rs, 4)];
Chris@42 74 Tb = W[14];
Chris@42 75 {
Chris@42 76 E T7, T8, T2, T3;
Chris@42 77 T2 = Ip[WS(rs, 2)];
Chris@42 78 T1l = Th - Ti;
Chris@42 79 Tj = Th + Ti;
Chris@42 80 Tf = Tb * Te;
Chris@42 81 T3 = Im[WS(rs, 2)];
Chris@42 82 T7 = Rp[WS(rs, 2)];
Chris@42 83 T1H = Tb * Tj;
Chris@42 84 T8 = Rm[WS(rs, 2)];
Chris@42 85 T4 = T2 - T3;
Chris@42 86 T1o = T2 + T3;
Chris@42 87 T1 = W[6];
Chris@42 88 T1r = T7 - T8;
Chris@42 89 T9 = T7 + T8;
Chris@42 90 T1n = W[8];
Chris@42 91 T5 = T1 * T4;
Chris@42 92 }
Chris@42 93 }
Chris@42 94 }
Chris@42 95 {
Chris@42 96 E T1F, T2a, T1p, T1h, T1k;
Chris@42 97 T1F = T1 * T9;
Chris@42 98 T2a = T1n * T1r;
Chris@42 99 T1p = T1n * T1o;
Chris@42 100 T1h = W[16];
Chris@42 101 T1k = W[17];
Chris@42 102 {
Chris@42 103 E T1G, Ta, Tk, T1I, T1q, T1w;
Chris@42 104 {
Chris@42 105 E T6, Tg, T2G, T1j;
Chris@42 106 T6 = W[7];
Chris@42 107 Tg = W[15];
Chris@42 108 T2G = T1h * T1l;
Chris@42 109 T1j = T1h * T1i;
Chris@42 110 T1G = FMA(T6, T4, T1F);
Chris@42 111 Ta = FNMS(T6, T9, T5);
Chris@42 112 T2H = FMA(T1k, T1i, T2G);
Chris@42 113 T1m = FNMS(T1k, T1l, T1j);
Chris@42 114 Tk = FNMS(Tg, Tj, Tf);
Chris@42 115 T1I = FMA(Tg, Te, T1H);
Chris@42 116 }
Chris@42 117 T1q = W[9];
Chris@42 118 T1w = W[1];
Chris@42 119 T1W = Ta - Tk;
Chris@42 120 Tl = Ta + Tk;
Chris@42 121 T1J = T1G + T1I;
Chris@42 122 T2i = T1I - T1G;
Chris@42 123 T2b = FMA(T1q, T1o, T2a);
Chris@42 124 T1s = FNMS(T1q, T1r, T1p);
Chris@42 125 T1A = FNMS(T1w, T1z, T1v);
Chris@42 126 T2d = FMA(T1w, T1u, T2c);
Chris@42 127 }
Chris@42 128 }
Chris@42 129 }
Chris@42 130 {
Chris@42 131 E T11, Tt, T10, TX, Ty, TZ, T23, T1b, TN, TS, T1e, T1P, TO, T17, TD;
Chris@42 132 E T16, T13, T14, TI, TA;
Chris@42 133 {
Chris@42 134 E Tw, Tx, Tr, Ts, TK;
Chris@42 135 Tr = Ip[WS(rs, 3)];
Chris@42 136 Ts = Im[WS(rs, 3)];
Chris@42 137 T2K = T1s - T1A;
Chris@42 138 T1B = T1s + T1A;
Chris@42 139 T2I = T2b + T2d;
Chris@42 140 T2e = T2b - T2d;
Chris@42 141 Tw = Rp[WS(rs, 3)];
Chris@42 142 T11 = Tr + Ts;
Chris@42 143 Tt = Tr - Ts;
Chris@42 144 Tx = Rm[WS(rs, 3)];
Chris@42 145 T10 = W[12];
Chris@42 146 TX = W[13];
Chris@42 147 {
Chris@42 148 E TL, TY, TM, TQ, TR;
Chris@42 149 TL = Ip[WS(rs, 1)];
Chris@42 150 Ty = Tw + Tx;
Chris@42 151 TY = Tx - Tw;
Chris@42 152 TM = Im[WS(rs, 1)];
Chris@42 153 TQ = Rp[WS(rs, 1)];
Chris@42 154 TR = Rm[WS(rs, 1)];
Chris@42 155 TZ = TX * TY;
Chris@42 156 T23 = T10 * TY;
Chris@42 157 T1b = TL + TM;
Chris@42 158 TN = TL - TM;
Chris@42 159 TS = TQ + TR;
Chris@42 160 T1e = TQ - TR;
Chris@42 161 }
Chris@42 162 TK = W[2];
Chris@42 163 {
Chris@42 164 E TG, TH, TB, TC;
Chris@42 165 TB = Ip[WS(rs, 5)];
Chris@42 166 TC = Im[WS(rs, 5)];
Chris@42 167 TG = Rp[WS(rs, 5)];
Chris@42 168 T1P = TK * TS;
Chris@42 169 TO = TK * TN;
Chris@42 170 T17 = TB + TC;
Chris@42 171 TD = TB - TC;
Chris@42 172 TH = Rm[WS(rs, 5)];
Chris@42 173 T16 = W[20];
Chris@42 174 T13 = W[21];
Chris@42 175 T14 = TH - TG;
Chris@42 176 TI = TG + TH;
Chris@42 177 TA = W[18];
Chris@42 178 }
Chris@42 179 }
Chris@42 180 {
Chris@42 181 E T12, T1N, TE, T18, T24, T26, T25, T15;
Chris@42 182 T12 = FMA(T10, T11, TZ);
Chris@42 183 T15 = T13 * T14;
Chris@42 184 T25 = T16 * T14;
Chris@42 185 T1N = TA * TI;
Chris@42 186 TE = TA * TD;
Chris@42 187 T18 = FMA(T16, T17, T15);
Chris@42 188 T24 = FNMS(TX, T11, T23);
Chris@42 189 T26 = FNMS(T13, T17, T25);
Chris@42 190 {
Chris@42 191 E Tv, T1L, Tu, Tq;
Chris@42 192 Tq = W[10];
Chris@42 193 T19 = T12 + T18;
Chris@42 194 T2E = T18 - T12;
Chris@42 195 Tv = W[11];
Chris@42 196 T2C = T24 + T26;
Chris@42 197 T27 = T24 - T26;
Chris@42 198 T1L = Tq * Ty;
Chris@42 199 Tu = Tq * Tt;
Chris@42 200 {
Chris@42 201 E T1d, T2A, T1c, T1a, TF, TP;
Chris@42 202 T1a = W[4];
Chris@42 203 T1d = W[5];
Chris@42 204 T1M = FMA(Tv, Tt, T1L);
Chris@42 205 Tz = FNMS(Tv, Ty, Tu);
Chris@42 206 T2A = T1a * T1e;
Chris@42 207 T1c = T1a * T1b;
Chris@42 208 TF = W[19];
Chris@42 209 TP = W[3];
Chris@42 210 T2B = FMA(T1d, T1b, T2A);
Chris@42 211 T1f = FNMS(T1d, T1e, T1c);
Chris@42 212 T1O = FMA(TF, TD, T1N);
Chris@42 213 TJ = FNMS(TF, TI, TE);
Chris@42 214 TT = FNMS(TP, TS, TO);
Chris@42 215 T1Q = FMA(TP, TN, T1P);
Chris@42 216 }
Chris@42 217 }
Chris@42 218 }
Chris@42 219 }
Chris@42 220 }
Chris@42 221 {
Chris@42 222 E T2h, T2D, T1Z, T2l, T2J, T22, T2k, T29, T30, T1U, T1V, T1Y, T2Z, T1T;
Chris@42 223 {
Chris@42 224 E T2Y, TW, T2V, T1D, T1K, T1S;
Chris@42 225 {
Chris@42 226 E Tp, T2W, TU, T1R, T2X, T1g, TV, T1C;
Chris@42 227 T2h = FNMS(KP500000000, Tl, To);
Chris@42 228 Tp = Tl + To;
Chris@42 229 T2W = T2C - T2B;
Chris@42 230 T2D = FMA(KP500000000, T2C, T2B);
Chris@42 231 T1Z = TJ - TT;
Chris@42 232 TU = TJ + TT;
Chris@42 233 T1R = T1O + T1Q;
Chris@42 234 T2l = T1Q - T1O;
Chris@42 235 T2J = FNMS(KP500000000, T2I, T2H);
Chris@42 236 T2X = T2H + T2I;
Chris@42 237 T1g = T19 + T1f;
Chris@42 238 T22 = FNMS(KP500000000, T19, T1f);
Chris@42 239 T2k = FNMS(KP500000000, TU, Tz);
Chris@42 240 TV = Tz + TU;
Chris@42 241 T1C = T1m + T1B;
Chris@42 242 T29 = FNMS(KP500000000, T1B, T1m);
Chris@42 243 T2Y = T2W - T2X;
Chris@42 244 T30 = T2W + T2X;
Chris@42 245 TW = Tp - TV;
Chris@42 246 T2V = TV + Tp;
Chris@42 247 T1U = T1g + T1C;
Chris@42 248 T1D = T1g - T1C;
Chris@42 249 T1V = FNMS(KP500000000, T1J, T1E);
Chris@42 250 T1K = T1E + T1J;
Chris@42 251 T1S = T1M + T1R;
Chris@42 252 T1Y = FNMS(KP500000000, T1R, T1M);
Chris@42 253 }
Chris@42 254 Ip[WS(rs, 3)] = KP500000000 * (TW + T1D); /* store phase begins: results overwrite the input locations, each scaled by KP500000000 */
Chris@42 255 Im[WS(rs, 2)] = KP500000000 * (T1D - TW);
Chris@42 256 Im[WS(rs, 5)] = KP500000000 * (T2Y - T2V);
Chris@42 257 T2Z = T1K - T1S;
Chris@42 258 T1T = T1K + T1S;
Chris@42 259 Ip[0] = KP500000000 * (T2V + T2Y);
Chris@42 260 }
Chris@42 261 {
Chris@42 262 E T2v, T1X, T2Q, T2F, T2R, T2L, T2w, T20, T2t, T28, T2p, T2j;
Chris@42 263 Rm[WS(rs, 2)] = KP500000000 * (T2Z + T30);
Chris@42 264 Rp[WS(rs, 3)] = KP500000000 * (T2Z - T30);
Chris@42 265 Rp[0] = KP500000000 * (T1T + T1U);
Chris@42 266 Rm[WS(rs, 5)] = KP500000000 * (T1T - T1U);
Chris@42 267 T2v = FMA(KP866025403, T1W, T1V);
Chris@42 268 T1X = FNMS(KP866025403, T1W, T1V);
Chris@42 269 T2Q = FMA(KP866025403, T2E, T2D);
Chris@42 270 T2F = FNMS(KP866025403, T2E, T2D);
Chris@42 271 T2R = FMA(KP866025403, T2K, T2J);
Chris@42 272 T2L = FNMS(KP866025403, T2K, T2J);
Chris@42 273 T2w = FMA(KP866025403, T1Z, T1Y);
Chris@42 274 T20 = FNMS(KP866025403, T1Z, T1Y);
Chris@42 275 T2t = FMA(KP866025403, T27, T22);
Chris@42 276 T28 = FNMS(KP866025403, T27, T22);
Chris@42 277 T2p = FMA(KP866025403, T2i, T2h);
Chris@42 278 T2j = FNMS(KP866025403, T2i, T2h);
Chris@42 279 {
Chris@42 280 E T2T, T2q, T2s, T2U;
Chris@42 281 {
Chris@42 282 E T21, T2f, T2S, T2n, T2P, T2m, T2o, T2g;
Chris@42 283 T2T = T1X - T20;
Chris@42 284 T21 = T1X + T20;
Chris@42 285 T2q = FMA(KP866025403, T2l, T2k);
Chris@42 286 T2m = FNMS(KP866025403, T2l, T2k);
Chris@42 287 T2s = FMA(KP866025403, T2e, T29);
Chris@42 288 T2f = FNMS(KP866025403, T2e, T29);
Chris@42 289 T2S = T2Q + T2R;
Chris@42 290 T2U = T2R - T2Q;
Chris@42 291 T2n = T2j - T2m;
Chris@42 292 T2P = T2m + T2j;
Chris@42 293 T2o = T2f - T28;
Chris@42 294 T2g = T28 + T2f;
Chris@42 295 Im[WS(rs, 3)] = KP500000000 * (T2S - T2P);
Chris@42 296 Ip[WS(rs, 2)] = KP500000000 * (T2P + T2S);
Chris@42 297 Rm[WS(rs, 3)] = KP500000000 * (T21 + T2g);
Chris@42 298 Rp[WS(rs, 2)] = KP500000000 * (T21 - T2g);
Chris@42 299 Ip[WS(rs, 5)] = KP500000000 * (T2n + T2o);
Chris@42 300 Im[0] = KP500000000 * (T2o - T2n);
Chris@42 301 }
Chris@42 302 {
Chris@42 303 E T2y, T2x, T2N, T2O, T2r, T2u;
Chris@42 304 T2z = T2q + T2p;
Chris@42 305 T2r = T2p - T2q;
Chris@42 306 T2u = T2s - T2t;
Chris@42 307 T2y = T2t + T2s;
Chris@42 308 T2x = T2v + T2w;
Chris@42 309 T2N = T2v - T2w;
Chris@42 310 Rp[WS(rs, 5)] = KP500000000 * (T2T + T2U);
Chris@42 311 Rm[0] = KP500000000 * (T2T - T2U);
Chris@42 312 Im[WS(rs, 4)] = KP500000000 * (T2u - T2r);
Chris@42 313 Ip[WS(rs, 1)] = KP500000000 * (T2r + T2u);
Chris@42 314 T2O = T2L - T2F;
Chris@42 315 T2M = T2F + T2L;
Chris@42 316 Rp[WS(rs, 1)] = KP500000000 * (T2N + T2O);
Chris@42 317 Rm[WS(rs, 4)] = KP500000000 * (T2N - T2O);
Chris@42 318 Rp[WS(rs, 4)] = KP500000000 * (T2x + T2y);
Chris@42 319 Rm[WS(rs, 1)] = KP500000000 * (T2x - T2y);
Chris@42 320 }
Chris@42 321 }
Chris@42 322 }
Chris@42 323 }
Chris@42 324 }
Chris@42 325 Im[WS(rs, 1)] = -(KP500000000 * (T2z + T2M)); /* the only output in this variant that is stored with an explicit negation */
Chris@42 326 Ip[WS(rs, 4)] = KP500000000 * (T2z - T2M);
Chris@42 327 }
Chris@42 328 }
Chris@42 329 }
Chris@42 330
Chris@42 331 static const tw_instr twinstr[] = { /* two-entry table referenced by desc; TW_FULL and TW_NEXT are defined outside this file */
Chris@42 332 {TW_FULL, 1, 12},
Chris@42 333 {TW_NEXT, 1, 0}
Chris@42 334 };
Chris@42 335
Chris@42 336 static const hc2c_desc desc = { 12, "hc2cfdft_12", twinstr, &GENUS, {96, 46, 46, 0} }; /* {96, 46, 46, 0} matches the counts quoted in the generator comment for the FMA build: 96 additions, 46 multiplications, 46 fused multiply/adds */
Chris@42 337
Chris@42 338 void X(codelet_hc2cfdft_12) (planner *p) { /* only non-static symbol in this branch: hands the kernel and its descriptor to X(khc2c_register) for planner p */
Chris@42 339 X(khc2c_register) (p, hc2cfdft_12, &desc, HC2C_VIA_DFT);
Chris@42 340 }
Chris@42 341 #else /* HAVE_FMA */
Chris@42 342
Chris@42 343 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cfdft_12 -include hc2cf.h */
Chris@42 344
Chris@42 345 /*
Chris@42 346 * This function contains 142 FP additions, 76 FP multiplications,
Chris@42 347 * (or, 112 additions, 46 multiplications, 30 fused multiply/add),
Chris@42 348 * 52 stack variables, 3 constants, and 48 memory accesses
Chris@42 349 */
Chris@42 350 #include "hc2cf.h"
Chris@42 351
Chris@42 352 static void hc2cfdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) /* Machine-generated size-12 kernel (build without HAVE_FMA; see the genfft command line above). For each m in [mb, me) it loads elements 0 and WS(rs, 1..5) of Rp, Ip, Rm and Im, combines them with W[0..21], and stores 24 results back into the same locations. rs is used only through WS()/MAKE_VOLATILE_STRIDE; ms is the per-iteration pointer step. */
Chris@42 353 {
Chris@42 354 DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@42 355 DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
Chris@42 356 DK(KP433012701, +0.433012701892219323381861585376468091735701313); /* numerically sqrt(3)/4 */
Chris@42 357 {
Chris@42 358 INT m;
Chris@42 359 for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) { /* W starts (mb - 1) * 22 values in; each iteration steps Rp/Ip forward and Rm/Im backward by ms and advances W by 22 values */
Chris@42 360 E Tm, T1t, T1d, T2j, Tj, T1Y, T1w, T1G, T1q, T2q, T1U, T2k, Tw, T1y, T17;
Chris@42 361 E T2g, TP, T21, T1B, T1J, T12, T2u, T1P, T2h;
Chris@42 362 {
Chris@42 363 E Tk, Tl, T1k, T1m, T1n, T1o, T4, T1f, T8, T1h, Th, T1c, Td, T1a, T19;
Chris@42 364 E T1b;
Chris@42 365 {
Chris@42 366 E T2, T3, T6, T7;
Chris@42 367 Tk = Ip[0]; /* load phase: inputs are read and summed/differenced in Ip/Im and Rp/Rm pairs; all loads in this iteration precede the first store */
Chris@42 368 Tl = Im[0];
Chris@42 369 T1k = Tk + Tl;
Chris@42 370 T1m = Rp[0];
Chris@42 371 T1n = Rm[0];
Chris@42 372 T1o = T1m - T1n;
Chris@42 373 T2 = Ip[WS(rs, 2)];
Chris@42 374 T3 = Im[WS(rs, 2)];
Chris@42 375 T4 = T2 - T3;
Chris@42 376 T1f = T2 + T3;
Chris@42 377 T6 = Rp[WS(rs, 2)];
Chris@42 378 T7 = Rm[WS(rs, 2)];
Chris@42 379 T8 = T6 + T7;
Chris@42 380 T1h = T6 - T7;
Chris@42 381 {
Chris@42 382 E Tf, Tg, Tb, Tc;
Chris@42 383 Tf = Rp[WS(rs, 4)];
Chris@42 384 Tg = Rm[WS(rs, 4)];
Chris@42 385 Th = Tf + Tg;
Chris@42 386 T1c = Tf - Tg;
Chris@42 387 Tb = Ip[WS(rs, 4)];
Chris@42 388 Tc = Im[WS(rs, 4)];
Chris@42 389 Td = Tb - Tc;
Chris@42 390 T1a = Tb + Tc;
Chris@42 391 }
Chris@42 392 }
Chris@42 393 Tm = Tk - Tl;
Chris@42 394 T1t = T1m + T1n;
Chris@42 395 T19 = W[16];
Chris@42 396 T1b = W[17];
Chris@42 397 T1d = FNMS(T1b, T1c, T19 * T1a);
Chris@42 398 T2j = FMA(T19, T1c, T1b * T1a);
Chris@42 399 {
Chris@42 400 E T9, T1u, Ti, T1v;
Chris@42 401 {
Chris@42 402 E T1, T5, Ta, Te;
Chris@42 403 T1 = W[6];
Chris@42 404 T5 = W[7];
Chris@42 405 T9 = FNMS(T5, T8, T1 * T4);
Chris@42 406 T1u = FMA(T1, T8, T5 * T4);
Chris@42 407 Ta = W[14];
Chris@42 408 Te = W[15];
Chris@42 409 Ti = FNMS(Te, Th, Ta * Td);
Chris@42 410 T1v = FMA(Ta, Th, Te * Td);
Chris@42 411 }
Chris@42 412 Tj = T9 + Ti;
Chris@42 413 T1Y = KP433012701 * (T1v - T1u);
Chris@42 414 T1w = T1u + T1v;
Chris@42 415 T1G = KP433012701 * (T9 - Ti);
Chris@42 416 }
Chris@42 417 {
Chris@42 418 E T1i, T1S, T1p, T1T;
Chris@42 419 {
Chris@42 420 E T1e, T1g, T1j, T1l;
Chris@42 421 T1e = W[8];
Chris@42 422 T1g = W[9];
Chris@42 423 T1i = FNMS(T1g, T1h, T1e * T1f);
Chris@42 424 T1S = FMA(T1e, T1h, T1g * T1f);
Chris@42 425 T1j = W[0];
Chris@42 426 T1l = W[1];
Chris@42 427 T1p = FNMS(T1l, T1o, T1j * T1k);
Chris@42 428 T1T = FMA(T1j, T1o, T1l * T1k);
Chris@42 429 }
Chris@42 430 T1q = T1i + T1p;
Chris@42 431 T2q = KP433012701 * (T1i - T1p);
Chris@42 432 T1U = KP433012701 * (T1S - T1T);
Chris@42 433 T2k = T1S + T1T;
Chris@42 434 }
Chris@42 435 }
Chris@42 436 {
Chris@42 437 E Tr, TT, Tv, TV, TA, TY, TE, T10, TN, T14, TJ, T16;
Chris@42 438 {
Chris@42 439 E Tp, Tq, TC, TD;
Chris@42 440 Tp = Ip[WS(rs, 3)];
Chris@42 441 Tq = Im[WS(rs, 3)];
Chris@42 442 Tr = Tp - Tq;
Chris@42 443 TT = Tp + Tq;
Chris@42 444 {
Chris@42 445 E Tt, Tu, Ty, Tz;
Chris@42 446 Tt = Rp[WS(rs, 3)];
Chris@42 447 Tu = Rm[WS(rs, 3)];
Chris@42 448 Tv = Tt + Tu;
Chris@42 449 TV = Tt - Tu;
Chris@42 450 Ty = Ip[WS(rs, 5)];
Chris@42 451 Tz = Im[WS(rs, 5)];
Chris@42 452 TA = Ty - Tz;
Chris@42 453 TY = Ty + Tz;
Chris@42 454 }
Chris@42 455 TC = Rp[WS(rs, 5)];
Chris@42 456 TD = Rm[WS(rs, 5)];
Chris@42 457 TE = TC + TD;
Chris@42 458 T10 = TC - TD;
Chris@42 459 {
Chris@42 460 E TL, TM, TH, TI;
Chris@42 461 TL = Rp[WS(rs, 1)];
Chris@42 462 TM = Rm[WS(rs, 1)];
Chris@42 463 TN = TL + TM;
Chris@42 464 T14 = TM - TL;
Chris@42 465 TH = Ip[WS(rs, 1)];
Chris@42 466 TI = Im[WS(rs, 1)];
Chris@42 467 TJ = TH - TI;
Chris@42 468 T16 = TH + TI;
Chris@42 469 }
Chris@42 470 }
Chris@42 471 {
Chris@42 472 E To, Ts, T13, T15;
Chris@42 473 To = W[10];
Chris@42 474 Ts = W[11];
Chris@42 475 Tw = FNMS(Ts, Tv, To * Tr);
Chris@42 476 T1y = FMA(To, Tv, Ts * Tr);
Chris@42 477 T13 = W[5];
Chris@42 478 T15 = W[4];
Chris@42 479 T17 = FMA(T13, T14, T15 * T16);
Chris@42 480 T2g = FNMS(T13, T16, T15 * T14);
Chris@42 481 }
Chris@42 482 {
Chris@42 483 E TF, T1z, TO, T1A;
Chris@42 484 {
Chris@42 485 E Tx, TB, TG, TK;
Chris@42 486 Tx = W[18];
Chris@42 487 TB = W[19];
Chris@42 488 TF = FNMS(TB, TE, Tx * TA);
Chris@42 489 T1z = FMA(Tx, TE, TB * TA);
Chris@42 490 TG = W[2];
Chris@42 491 TK = W[3];
Chris@42 492 TO = FNMS(TK, TN, TG * TJ);
Chris@42 493 T1A = FMA(TG, TN, TK * TJ);
Chris@42 494 }
Chris@42 495 TP = TF + TO;
Chris@42 496 T21 = KP433012701 * (T1A - T1z);
Chris@42 497 T1B = T1z + T1A;
Chris@42 498 T1J = KP433012701 * (TF - TO);
Chris@42 499 }
Chris@42 500 {
Chris@42 501 E TW, T1O, T11, T1N;
Chris@42 502 {
Chris@42 503 E TS, TU, TX, TZ;
Chris@42 504 TS = W[12];
Chris@42 505 TU = W[13];
Chris@42 506 TW = FNMS(TU, TV, TS * TT);
Chris@42 507 T1O = FMA(TS, TV, TU * TT);
Chris@42 508 TX = W[20];
Chris@42 509 TZ = W[21];
Chris@42 510 T11 = FNMS(TZ, T10, TX * TY);
Chris@42 511 T1N = FMA(TX, T10, TZ * TY);
Chris@42 512 }
Chris@42 513 T12 = TW + T11;
Chris@42 514 T2u = KP433012701 * (T11 - TW);
Chris@42 515 T1P = KP433012701 * (T1N - T1O);
Chris@42 516 T2h = T1O + T1N;
Chris@42 517 }
Chris@42 518 }
Chris@42 519 {
Chris@42 520 E TR, T2f, T2m, T2o, T1s, T1E, T1D, T2n;
Chris@42 521 {
Chris@42 522 E Tn, TQ, T2i, T2l;
Chris@42 523 Tn = Tj + Tm;
Chris@42 524 TQ = Tw + TP;
Chris@42 525 TR = Tn - TQ;
Chris@42 526 T2f = TQ + Tn;
Chris@42 527 T2i = T2g - T2h;
Chris@42 528 T2l = T2j + T2k;
Chris@42 529 T2m = T2i - T2l;
Chris@42 530 T2o = T2i + T2l;
Chris@42 531 }
Chris@42 532 {
Chris@42 533 E T18, T1r, T1x, T1C;
Chris@42 534 T18 = T12 + T17;
Chris@42 535 T1r = T1d + T1q;
Chris@42 536 T1s = T18 - T1r;
Chris@42 537 T1E = T18 + T1r;
Chris@42 538 T1x = T1t + T1w;
Chris@42 539 T1C = T1y + T1B;
Chris@42 540 T1D = T1x + T1C;
Chris@42 541 T2n = T1x - T1C;
Chris@42 542 }
Chris@42 543 Ip[WS(rs, 3)] = KP500000000 * (TR + T1s); /* store phase begins: these first eight outputs are scaled by KP500000000 at the store */
Chris@42 544 Rp[WS(rs, 3)] = KP500000000 * (T2n - T2o);
Chris@42 545 Im[WS(rs, 2)] = KP500000000 * (T1s - TR);
Chris@42 546 Rm[WS(rs, 2)] = KP500000000 * (T2n + T2o);
Chris@42 547 Rm[WS(rs, 5)] = KP500000000 * (T1D - T1E);
Chris@42 548 Im[WS(rs, 5)] = KP500000000 * (T2m - T2f);
Chris@42 549 Rp[0] = KP500000000 * (T1D + T1E);
Chris@42 550 Ip[0] = KP500000000 * (T2f + T2m);
Chris@42 551 }
Chris@42 552 {
Chris@42 553 E T1H, T2b, T2s, T2B, T2v, T2A, T1K, T2c, T1Q, T29, T1Z, T25, T22, T26, T1V;
Chris@42 554 E T28;
Chris@42 555 {
Chris@42 556 E T1F, T2r, T2t, T1I;
Chris@42 557 T1F = FNMS(KP250000000, T1w, KP500000000 * T1t);
Chris@42 558 T1H = T1F - T1G;
Chris@42 559 T2b = T1F + T1G;
Chris@42 560 T2r = FNMS(KP500000000, T2j, KP250000000 * T2k);
Chris@42 561 T2s = T2q - T2r;
Chris@42 562 T2B = T2q + T2r;
Chris@42 563 T2t = FMA(KP250000000, T2h, KP500000000 * T2g);
Chris@42 564 T2v = T2t - T2u;
Chris@42 565 T2A = T2u + T2t;
Chris@42 566 T1I = FNMS(KP250000000, T1B, KP500000000 * T1y);
Chris@42 567 T1K = T1I - T1J;
Chris@42 568 T2c = T1I + T1J;
Chris@42 569 }
Chris@42 570 {
Chris@42 571 E T1M, T1X, T20, T1R;
Chris@42 572 T1M = FNMS(KP250000000, T12, KP500000000 * T17);
Chris@42 573 T1Q = T1M - T1P;
Chris@42 574 T29 = T1P + T1M;
Chris@42 575 T1X = FNMS(KP250000000, Tj, KP500000000 * Tm);
Chris@42 576 T1Z = T1X - T1Y;
Chris@42 577 T25 = T1Y + T1X;
Chris@42 578 T20 = FNMS(KP250000000, TP, KP500000000 * Tw);
Chris@42 579 T22 = T20 - T21;
Chris@42 580 T26 = T21 + T20;
Chris@42 581 T1R = FNMS(KP250000000, T1q, KP500000000 * T1d);
Chris@42 582 T1V = T1R - T1U;
Chris@42 583 T28 = T1R + T1U;
Chris@42 584 }
Chris@42 585 {
Chris@42 586 E T1L, T1W, T2p, T2w;
Chris@42 587 T1L = T1H + T1K;
Chris@42 588 T1W = T1Q + T1V;
Chris@42 589 Rp[WS(rs, 2)] = T1L - T1W; /* from here on outputs are stored as plain sums/differences; their operands were already multiplied by KP250000000, KP500000000 or KP433012701 above */
Chris@42 590 Rm[WS(rs, 3)] = T1L + T1W;
Chris@42 591 T2p = T22 + T1Z;
Chris@42 592 T2w = T2s - T2v;
Chris@42 593 Ip[WS(rs, 2)] = T2p + T2w;
Chris@42 594 Im[WS(rs, 3)] = T2w - T2p;
Chris@42 595 }
Chris@42 596 {
Chris@42 597 E T23, T24, T2x, T2y;
Chris@42 598 T23 = T1Z - T22;
Chris@42 599 T24 = T1V - T1Q;
Chris@42 600 Ip[WS(rs, 5)] = T23 + T24;
Chris@42 601 Im[0] = T24 - T23;
Chris@42 602 T2x = T1H - T1K;
Chris@42 603 T2y = T2v + T2s;
Chris@42 604 Rm[0] = T2x - T2y;
Chris@42 605 Rp[WS(rs, 5)] = T2x + T2y;
Chris@42 606 }
Chris@42 607 {
Chris@42 608 E T27, T2a, T2z, T2C;
Chris@42 609 T27 = T25 - T26;
Chris@42 610 T2a = T28 - T29;
Chris@42 611 Ip[WS(rs, 1)] = T27 + T2a;
Chris@42 612 Im[WS(rs, 4)] = T2a - T27;
Chris@42 613 T2z = T2b - T2c;
Chris@42 614 T2C = T2A - T2B;
Chris@42 615 Rm[WS(rs, 4)] = T2z - T2C;
Chris@42 616 Rp[WS(rs, 1)] = T2z + T2C;
Chris@42 617 }
Chris@42 618 {
Chris@42 619 E T2d, T2e, T2D, T2E;
Chris@42 620 T2d = T2b + T2c;
Chris@42 621 T2e = T29 + T28;
Chris@42 622 Rm[WS(rs, 1)] = T2d - T2e;
Chris@42 623 Rp[WS(rs, 4)] = T2d + T2e;
Chris@42 624 T2D = T26 + T25;
Chris@42 625 T2E = T2A + T2B;
Chris@42 626 Ip[WS(rs, 4)] = T2D + T2E;
Chris@42 627 Im[WS(rs, 1)] = T2E - T2D;
Chris@42 628 }
Chris@42 629 }
Chris@42 630 }
Chris@42 631 }
Chris@42 632 }
Chris@42 633
Chris@42 634 static const tw_instr twinstr[] = { /* two-entry table referenced by desc; TW_FULL and TW_NEXT are defined outside this file */
Chris@42 635 {TW_FULL, 1, 12},
Chris@42 636 {TW_NEXT, 1, 0}
Chris@42 637 };
Chris@42 638
Chris@42 639 static const hc2c_desc desc = { 12, "hc2cfdft_12", twinstr, &GENUS, {112, 46, 30, 0} }; /* {112, 46, 30, 0} matches the counts quoted in the generator comment for the non-FMA build: 112 additions, 46 multiplications, 30 fused multiply/adds */
Chris@42 640
Chris@42 641 void X(codelet_hc2cfdft_12) (planner *p) { /* only non-static symbol in this branch: hands the kernel and its descriptor to X(khc2c_register) for planner p */
Chris@42 642 X(khc2c_register) (p, hc2cfdft_12, &desc, HC2C_VIA_DFT);
Chris@42 643 }
Chris@42 644 #endif /* HAVE_FMA */