annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cbdft_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:51:56 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cbdft_10 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 122 FP additions, 72 FP multiplications,
Chris@42 32 * (or, 68 additions, 18 multiplications, 54 fused multiply/add),
Chris@42 33 * 95 stack variables, 4 constants, and 40 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cbdft_10, HAVE_FMA variant (generated code, see the genfft command
 * line recorded above: -fma ... -sign 1 -n 10 -dif).
 *
 * For every m in [mb, me) the loop body:
 *   - reads five elements from each of Rp, Ip, Rm and Im, at index 0 and at
 *     WS(rs, 1) .. WS(rs, 4) (20 loads in total);
 *   - combines them using the four DK constants and the 18 twiddle values
 *     W[0] .. W[17];
 *   - writes the results back to the same 20 locations.
 * All loads from Rp/Ip/Rm/Im are issued before the first store.
 *
 * Between iterations Rp and Ip advance by ms, Rm and Im step back by ms and
 * W advances by 18; before the first iteration W is offset by (mb - 1) * 18.
 *
 * NOTE(review): going by the generator options this is presumably the
 * size-10 backward halfcomplex-to-complex (via DFT) step; the precise
 * mathematical contract is defined by genfft and is not visible here.
 */
Chris@42 37 static void hc2cbdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Named constants; numerically sin(2*pi/5), sqrt(5)/4, 1/4 and (sqrt(5)-1)/2. */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 {
Chris@42 44 INT m;
/* One pass per m; the pointer updates live in the for-increment expression. */
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@42 46 E T2d, T2f;
/* Phase 1: load the inputs and form the sum/difference intermediates. */
Chris@42 47 {
Chris@42 48 E T1g, TQ, T1z, TZ, Tu, T23, T1p, T14, Tt, T27, T13, Tj, Tz, T1i, T18;
Chris@42 49 E TJ, TS, T19, Ty, TA;
Chris@42 50 {
Chris@42 51 E Tl, T3, T7, Tm, T6, Tr, TY, T1n, Th, T8, T1, T2;
Chris@42 52 T1 = Rp[0];
Chris@42 53 T2 = Rm[WS(rs, 4)];
Chris@42 54 {
Chris@42 55 E Te, Tp, Td, Tf, Tb, Tc;
Chris@42 56 Tb = Rp[WS(rs, 4)];
Chris@42 57 Tc = Rm[0];
Chris@42 58 Te = Rm[WS(rs, 3)];
Chris@42 59 Tl = T1 - T2;
Chris@42 60 T3 = T1 + T2;
Chris@42 61 Tp = Tb - Tc;
Chris@42 62 Td = Tb + Tc;
Chris@42 63 Tf = Rp[WS(rs, 1)];
Chris@42 64 {
Chris@42 65 E T4, T5, Tq, Tg;
Chris@42 66 T4 = Rp[WS(rs, 2)];
Chris@42 67 T5 = Rm[WS(rs, 2)];
Chris@42 68 T7 = Rm[WS(rs, 1)];
Chris@42 69 Tq = Te - Tf;
Chris@42 70 Tg = Te + Tf;
Chris@42 71 Tm = T4 - T5;
Chris@42 72 T6 = T4 + T5;
Chris@42 73 Tr = Tp + Tq;
Chris@42 74 TY = Tp - Tq;
Chris@42 75 T1n = Td - Tg;
Chris@42 76 Th = Td + Tg;
Chris@42 77 T8 = Rp[WS(rs, 3)];
Chris@42 78 }
Chris@42 79 }
Chris@42 80 {
Chris@42 81 E TO, Tn, T9, TP;
Chris@42 82 TO = Ip[0];
Chris@42 83 Tn = T7 - T8;
Chris@42 84 T9 = T7 + T8;
Chris@42 85 TP = Im[WS(rs, 4)];
Chris@42 86 {
Chris@42 87 E TG, TH, TF, T16, TD, TE, Ti;
Chris@42 88 TD = Ip[WS(rs, 4)];
Chris@42 89 {
Chris@42 90 E TX, To, T1o, Ta, Ts;
Chris@42 91 TX = Tm - Tn;
Chris@42 92 To = Tm + Tn;
Chris@42 93 T1o = T6 - T9;
Chris@42 94 Ta = T6 + T9;
Chris@42 95 T1g = TO - TP;
Chris@42 96 TQ = TO + TP;
Chris@42 97 T1z = FNMS(KP618033988, TX, TY);
Chris@42 98 TZ = FMA(KP618033988, TY, TX);
Chris@42 99 Ts = To + Tr;
Chris@42 100 Tu = To - Tr;
Chris@42 101 T23 = FMA(KP618033988, T1n, T1o);
Chris@42 102 T1p = FNMS(KP618033988, T1o, T1n);
Chris@42 103 Ti = Ta + Th;
Chris@42 104 T14 = Ta - Th;
Chris@42 105 Tt = FNMS(KP250000000, Ts, Tl);
Chris@42 106 T27 = Tl + Ts;
Chris@42 107 TE = Im[0];
Chris@42 108 }
Chris@42 109 T13 = FNMS(KP250000000, Ti, T3);
Chris@42 110 Tj = T3 + Ti;
Chris@42 111 TG = Im[WS(rs, 3)];
Chris@42 112 TH = Ip[WS(rs, 1)];
Chris@42 113 TF = TD + TE;
Chris@42 114 T16 = TD - TE;
Chris@42 115 {
Chris@42 116 E Tw, T17, TI, Tx;
Chris@42 117 Tw = Ip[WS(rs, 2)];
Chris@42 118 T17 = TH - TG;
Chris@42 119 TI = TG + TH;
Chris@42 120 Tx = Im[WS(rs, 2)];
Chris@42 121 Tz = Im[WS(rs, 1)];
Chris@42 122 T1i = T16 + T17;
Chris@42 123 T18 = T16 - T17;
Chris@42 124 TJ = TF + TI;
Chris@42 125 TS = TF - TI;
Chris@42 126 T19 = Tw - Tx;
Chris@42 127 Ty = Tw + Tx;
Chris@42 128 TA = Ip[WS(rs, 3)];
Chris@42 129 }
Chris@42 130 }
Chris@42 131 }
Chris@42 132 }
/* Phase 2: finish the butterflies, fetch W[0..17] and form the twiddle products. */
Chris@42 133 {
Chris@42 134 E T26, T2y, T2a, T28, T1q, T1K, T24, T2k, T10, T1Q, T1A, T2q, T29, Tk, TN;
Chris@42 135 E T2c, T1M, T1P, T2w, TM, T1O, T1S, T1s, T1x, T2m, T2p, T1w, T1C, T2o, T2s;
Chris@42 136 E T12, T1f, T1G, T1J, T1I, T1E, T1e, T1U, T1W, T21, T2g, T2j, T20, T2e, T2i;
Chris@42 137 E T2u, T1a, TB;
Chris@42 138 T1a = TA - Tz;
Chris@42 139 TB = Tz + TA;
Chris@42 140 {
Chris@42 141 E T1Y, T1c, T1u, T1t, T1N, TL, TK, Tv, T2n, T1v;
Chris@42 142 {
Chris@42 143 E T1l, TV, T1k, TU, T1b, T1h;
Chris@42 144 T26 = W[9];
Chris@42 145 T1b = T19 - T1a;
Chris@42 146 T1h = T19 + T1a;
Chris@42 147 {
Chris@42 148 E TC, TR, T1j, TT;
Chris@42 149 TC = Ty + TB;
Chris@42 150 TR = Ty - TB;
Chris@42 151 T1Y = FMA(KP618033988, T18, T1b);
Chris@42 152 T1c = FNMS(KP618033988, T1b, T18);
Chris@42 153 T1j = T1h + T1i;
Chris@42 154 T1l = T1h - T1i;
Chris@42 155 T1u = FNMS(KP618033988, TC, TJ);
Chris@42 156 TK = FMA(KP618033988, TJ, TC);
Chris@42 157 TT = TR + TS;
Chris@42 158 TV = TR - TS;
Chris@42 159 T2y = T1g + T1j;
Chris@42 160 T1k = FNMS(KP250000000, T1j, T1g);
Chris@42 161 T2a = TQ + TT;
Chris@42 162 TU = FNMS(KP250000000, TT, TQ);
Chris@42 163 T28 = T26 * T27;
Chris@42 164 }
Chris@42 165 {
Chris@42 166 E T22, T1m, T1y, TW;
Chris@42 167 T22 = FMA(KP559016994, T1l, T1k);
Chris@42 168 T1m = FNMS(KP559016994, T1l, T1k);
Chris@42 169 T1y = FNMS(KP559016994, TV, TU);
Chris@42 170 TW = FMA(KP559016994, TV, TU);
Chris@42 171 T1q = FNMS(KP951056516, T1p, T1m);
Chris@42 172 T1K = FMA(KP951056516, T1p, T1m);
Chris@42 173 T24 = FNMS(KP951056516, T23, T22);
Chris@42 174 T2k = FMA(KP951056516, T23, T22);
Chris@42 175 T10 = FMA(KP951056516, TZ, TW);
Chris@42 176 T1Q = FNMS(KP951056516, TZ, TW);
Chris@42 177 T1A = FMA(KP951056516, T1z, T1y);
Chris@42 178 T2q = FNMS(KP951056516, T1z, T1y);
Chris@42 179 T29 = W[8];
Chris@42 180 }
Chris@42 181 }
Chris@42 182 Tv = FMA(KP559016994, Tu, Tt);
Chris@42 183 T1t = FNMS(KP559016994, Tu, Tt);
Chris@42 184 Tk = W[1];
Chris@42 185 TN = W[0];
Chris@42 186 T2c = T29 * T27;
Chris@42 187 T1N = FMA(KP951056516, TK, Tv);
Chris@42 188 TL = FNMS(KP951056516, TK, Tv);
Chris@42 189 T1M = W[17];
Chris@42 190 T1P = W[16];
Chris@42 191 T2w = TN * TL;
Chris@42 192 TM = Tk * TL;
Chris@42 193 T1O = T1M * T1N;
Chris@42 194 T1S = T1P * T1N;
Chris@42 195 T2n = FMA(KP951056516, T1u, T1t);
Chris@42 196 T1v = FNMS(KP951056516, T1u, T1t);
Chris@42 197 T1s = W[5];
Chris@42 198 T1x = W[4];
Chris@42 199 T2m = W[13];
Chris@42 200 T2p = W[12];
Chris@42 201 T1w = T1s * T1v;
Chris@42 202 T1C = T1x * T1v;
Chris@42 203 T2o = T2m * T2n;
Chris@42 204 T2s = T2p * T2n;
Chris@42 205 {
Chris@42 206 E T1X, T1d, T1H, T15, T2h, T1Z;
Chris@42 207 T1X = FMA(KP559016994, T14, T13);
Chris@42 208 T15 = FNMS(KP559016994, T14, T13);
Chris@42 209 T12 = W[2];
Chris@42 210 T1f = W[3];
Chris@42 211 T1G = W[14];
Chris@42 212 T1d = FMA(KP951056516, T1c, T15);
Chris@42 213 T1H = FNMS(KP951056516, T1c, T15);
Chris@42 214 T1J = W[15];
Chris@42 215 T1I = T1G * T1H;
Chris@42 216 T1E = T1f * T1d;
Chris@42 217 T1e = T12 * T1d;
Chris@42 218 T1U = T1J * T1H;
Chris@42 219 T2h = FNMS(KP951056516, T1Y, T1X);
Chris@42 220 T1Z = FMA(KP951056516, T1Y, T1X);
Chris@42 221 T1W = W[6];
Chris@42 222 T21 = W[7];
Chris@42 223 T2g = W[10];
Chris@42 224 T2j = W[11];
Chris@42 225 T20 = T1W * T1Z;
Chris@42 226 T2e = T21 * T1Z;
Chris@42 227 T2i = T2g * T2h;
Chris@42 228 T2u = T2j * T2h;
Chris@42 229 }
Chris@42 230 }
/* Phase 3: complete each twiddle multiply with an FMA/FNMS and store the outputs. */
Chris@42 231 {
Chris@42 232 E T1D, T1F, T1L, T1R;
Chris@42 233 {
Chris@42 234 E T11, T2x, T1r, T1B;
Chris@42 235 T11 = FMA(TN, T10, TM);
Chris@42 236 T2x = FNMS(Tk, T10, T2w);
Chris@42 237 T1r = FNMS(T1f, T1q, T1e);
Chris@42 238 T1B = FMA(T1x, T1A, T1w);
Chris@42 239 Rm[0] = Tj + T11;
Chris@42 240 Rp[0] = Tj - T11;
Chris@42 241 Ip[0] = T2x + T2y;
Chris@42 242 Im[0] = T2x - T2y;
Chris@42 243 Rp[WS(rs, 1)] = T1r - T1B;
Chris@42 244 Rm[WS(rs, 1)] = T1B + T1r;
Chris@42 245 T1D = FNMS(T1s, T1A, T1C);
Chris@42 246 T1F = FMA(T12, T1q, T1E);
Chris@42 247 T1L = FNMS(T1J, T1K, T1I);
Chris@42 248 T1R = FMA(T1P, T1Q, T1O);
Chris@42 249 }
Chris@42 250 {
Chris@42 251 E T1T, T1V, T2t, T2v;
Chris@42 252 T1T = FNMS(T1M, T1Q, T1S);
Chris@42 253 Ip[WS(rs, 1)] = T1D + T1F;
Chris@42 254 Im[WS(rs, 1)] = T1D - T1F;
Chris@42 255 Rm[WS(rs, 4)] = T1R + T1L;
Chris@42 256 Rp[WS(rs, 4)] = T1L - T1R;
Chris@42 257 T1V = FMA(T1G, T1K, T1U);
Chris@42 258 T2t = FNMS(T2m, T2q, T2s);
Chris@42 259 T2v = FMA(T2g, T2k, T2u);
Chris@42 260 {
Chris@42 261 E T2l, T2r, T25, T2b;
Chris@42 262 T2l = FNMS(T2j, T2k, T2i);
Chris@42 263 Ip[WS(rs, 4)] = T1T + T1V;
Chris@42 264 Im[WS(rs, 4)] = T1T - T1V;
Chris@42 265 Ip[WS(rs, 3)] = T2t + T2v;
Chris@42 266 Im[WS(rs, 3)] = T2t - T2v;
Chris@42 267 T2r = FMA(T2p, T2q, T2o);
Chris@42 268 T25 = FNMS(T21, T24, T20);
Chris@42 269 T2b = FMA(T29, T2a, T28);
Chris@42 270 T2d = FNMS(T26, T2a, T2c);
Chris@42 271 Rm[WS(rs, 3)] = T2r + T2l;
Chris@42 272 Rp[WS(rs, 3)] = T2l - T2r;
Chris@42 273 Rm[WS(rs, 2)] = T2b + T25;
Chris@42 274 Rp[WS(rs, 2)] = T25 - T2b;
Chris@42 275 T2f = FMA(T1W, T24, T2e);
Chris@42 276 }
Chris@42 277 }
Chris@42 278 }
Chris@42 279 }
Chris@42 280 }
/* T2d and T2f are declared at loop scope so this last pair of stores can follow the inner blocks. */
Chris@42 281 Ip[WS(rs, 2)] = T2d + T2f;
Chris@42 282 Im[WS(rs, 2)] = T2d - T2f;
Chris@42 283 }
Chris@42 284 }
Chris@42 285 }
Chris@42 286
/*
 * Twiddle instruction table referenced by desc below.
 * NOTE(review): presumably requests the full twiddle set for size 10, which
 * would be consistent with the 18 W values consumed per loop iteration;
 * confirm against the tw_instr definition.
 */
Chris@42 287 static const tw_instr twinstr[] = {
Chris@42 288 {TW_FULL, 1, 10},
Chris@42 289 {TW_NEXT, 1, 0}
Chris@42 290 };
Chris@42 291
/*
 * Codelet descriptor: size 10, codelet name, twiddle table and GENUS.
 * The trailing {68, 18, 54, 0} matches the operation counts quoted in the
 * generated header comment of this HAVE_FMA variant (68 additions,
 * 18 multiplications, 54 fused multiply/add).
 */
Chris@42 292 static const hc2c_desc desc = { 10, "hc2cbdft_10", twinstr, &GENUS, {68, 18, 54, 0} };
Chris@42 293
/*
 * Registration entry point: hands the HAVE_FMA hc2cbdft_10 kernel and its
 * descriptor to planner p through khc2c_register, tagged HC2C_VIA_DFT.
 */
Chris@42 294 void X(codelet_hc2cbdft_10) (planner *p) {
Chris@42 295 X(khc2c_register) (p, hc2cbdft_10, &desc, HC2C_VIA_DFT);
Chris@42 296 }
Chris@42 297 #else /* HAVE_FMA */
Chris@42 298
Chris@42 299 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 10 -dif -name hc2cbdft_10 -include hc2cb.h */
Chris@42 300
Chris@42 301 /*
Chris@42 302 * This function contains 122 FP additions, 60 FP multiplications,
Chris@42 303 * (or, 92 additions, 30 multiplications, 30 fused multiply/add),
Chris@42 304 * 61 stack variables, 4 constants, and 40 memory accesses
Chris@42 305 */
Chris@42 306 #include "hc2cb.h"
Chris@42 307
/*
 * hc2cbdft_10, non-FMA variant (generated code, see the genfft command line
 * recorded above: -sign 1 -n 10 -dif, without -fma).
 *
 * For every m in [mb, me) the loop body:
 *   - reads five elements from each of Rp, Ip, Rm and Im, at index 0 and at
 *     WS(rs, 1) .. WS(rs, 4) (20 loads in total);
 *   - combines them using the four DK constants and the 18 twiddle values
 *     W[0] .. W[17];
 *   - writes the results back to the same 20 locations.
 * All loads from Rp/Ip/Rm/Im are issued before the first store.
 *
 * Between iterations Rp and Ip advance by ms, Rm and Im step back by ms and
 * W advances by 18; before the first iteration W is offset by (mb - 1) * 18.
 *
 * NOTE(review): going by the generator options this is presumably the
 * size-10 backward halfcomplex-to-complex (via DFT) step; the precise
 * mathematical contract is defined by genfft and is not visible here.
 */
Chris@42 308 static void hc2cbdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 309 {
/* Named constants; numerically sin(2*pi/5), sin(pi/5), 1/4 and sqrt(5)/4. */
Chris@42 310 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 311 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 312 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 313 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 314 {
Chris@42 315 INT m;
/* One pass per m; the pointer updates live in the for-increment expression. */
Chris@42 316 for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@42 317 E T3, TS, TR, T13, Ti, T12, TT, TU, T1g, T1T, Tr, T1s, TJ, T1h, TG;
Chris@42 318 E T1m, TK, TL, T1k, T1l, T1b, T1P, TY, T1w;
/* Phase 1a: load the ten Rp/Rm inputs and form their sum/difference terms. */
Chris@42 319 {
Chris@42 320 E Td, To, Tg, Tp, Th, TQ, T6, Tl, T9, Tm, Ta, TP, T1, T2;
Chris@42 321 T1 = Rp[0];
Chris@42 322 T2 = Rm[WS(rs, 4)];
Chris@42 323 T3 = T1 + T2;
Chris@42 324 TS = T1 - T2;
Chris@42 325 {
Chris@42 326 E Tb, Tc, Te, Tf;
Chris@42 327 Tb = Rp[WS(rs, 4)];
Chris@42 328 Tc = Rm[0];
Chris@42 329 Td = Tb + Tc;
Chris@42 330 To = Tb - Tc;
Chris@42 331 Te = Rm[WS(rs, 3)];
Chris@42 332 Tf = Rp[WS(rs, 1)];
Chris@42 333 Tg = Te + Tf;
Chris@42 334 Tp = Te - Tf;
Chris@42 335 }
Chris@42 336 Th = Td + Tg;
Chris@42 337 TQ = To + Tp;
Chris@42 338 {
Chris@42 339 E T4, T5, T7, T8;
Chris@42 340 T4 = Rp[WS(rs, 2)];
Chris@42 341 T5 = Rm[WS(rs, 2)];
Chris@42 342 T6 = T4 + T5;
Chris@42 343 Tl = T4 - T5;
Chris@42 344 T7 = Rm[WS(rs, 1)];
Chris@42 345 T8 = Rp[WS(rs, 3)];
Chris@42 346 T9 = T7 + T8;
Chris@42 347 Tm = T7 - T8;
Chris@42 348 }
Chris@42 349 Ta = T6 + T9;
Chris@42 350 TP = Tl + Tm;
Chris@42 351 TR = KP559016994 * (TP - TQ);
Chris@42 352 T13 = KP559016994 * (Ta - Th);
Chris@42 353 Ti = Ta + Th;
Chris@42 354 T12 = FNMS(KP250000000, Ti, T3);
Chris@42 355 TT = TP + TQ;
Chris@42 356 TU = FNMS(KP250000000, TT, TS);
Chris@42 357 {
Chris@42 358 E T1e, T1f, Tn, Tq;
Chris@42 359 T1e = T6 - T9;
Chris@42 360 T1f = Td - Tg;
Chris@42 361 T1g = FNMS(KP951056516, T1f, KP587785252 * T1e);
Chris@42 362 T1T = FMA(KP951056516, T1e, KP587785252 * T1f);
Chris@42 363 Tn = Tl - Tm;
Chris@42 364 Tq = To - Tp;
Chris@42 365 Tr = FMA(KP951056516, Tn, KP587785252 * Tq);
Chris@42 366 T1s = FNMS(KP951056516, Tq, KP587785252 * Tn);
Chris@42 367 }
Chris@42 368 }
/* Phase 1b: the same treatment for the ten Ip/Im inputs. */
Chris@42 369 {
Chris@42 370 E TB, T18, TE, T19, TF, T1j, Tu, T15, Tx, T16, Ty, T1i, TH, TI;
Chris@42 371 TH = Ip[0];
Chris@42 372 TI = Im[WS(rs, 4)];
Chris@42 373 TJ = TH + TI;
Chris@42 374 T1h = TH - TI;
Chris@42 375 {
Chris@42 376 E Tz, TA, TC, TD;
Chris@42 377 Tz = Ip[WS(rs, 4)];
Chris@42 378 TA = Im[0];
Chris@42 379 TB = Tz + TA;
Chris@42 380 T18 = Tz - TA;
Chris@42 381 TC = Im[WS(rs, 3)];
Chris@42 382 TD = Ip[WS(rs, 1)];
Chris@42 383 TE = TC + TD;
Chris@42 384 T19 = TD - TC;
Chris@42 385 }
Chris@42 386 TF = TB - TE;
Chris@42 387 T1j = T18 + T19;
Chris@42 388 {
Chris@42 389 E Ts, Tt, Tv, Tw;
Chris@42 390 Ts = Ip[WS(rs, 2)];
Chris@42 391 Tt = Im[WS(rs, 2)];
Chris@42 392 Tu = Ts + Tt;
Chris@42 393 T15 = Ts - Tt;
Chris@42 394 Tv = Im[WS(rs, 1)];
Chris@42 395 Tw = Ip[WS(rs, 3)];
Chris@42 396 Tx = Tv + Tw;
Chris@42 397 T16 = Tw - Tv;
Chris@42 398 }
Chris@42 399 Ty = Tu - Tx;
Chris@42 400 T1i = T15 + T16;
Chris@42 401 TG = KP559016994 * (Ty - TF);
Chris@42 402 T1m = KP559016994 * (T1i - T1j);
Chris@42 403 TK = Ty + TF;
Chris@42 404 TL = FNMS(KP250000000, TK, TJ);
Chris@42 405 T1k = T1i + T1j;
Chris@42 406 T1l = FNMS(KP250000000, T1k, T1h);
Chris@42 407 {
Chris@42 408 E T17, T1a, TW, TX;
Chris@42 409 T17 = T15 - T16;
Chris@42 410 T1a = T18 - T19;
Chris@42 411 T1b = FNMS(KP951056516, T1a, KP587785252 * T17);
Chris@42 412 T1P = FMA(KP951056516, T17, KP587785252 * T1a);
Chris@42 413 TW = Tu + Tx;
Chris@42 414 TX = TB + TE;
Chris@42 415 TY = FMA(KP951056516, TW, KP587785252 * TX);
Chris@42 416 T1w = FNMS(KP951056516, TX, KP587785252 * TW);
Chris@42 417 }
Chris@42 418 }
/* Phase 2: combine the intermediates, multiply by pairs of W values and store all 20 outputs. */
Chris@42 419 {
Chris@42 420 E Tj, T2g, TN, T1H, T1U, T26, TZ, T1J, T1Q, T24, T1c, T1C, T1t, T29, T1o;
Chris@42 421 E T1E, T1x, T2b, T20, T21, TM, T1S, TV;
Chris@42 422 Tj = T3 + Ti;
Chris@42 423 T2g = T1h + T1k;
Chris@42 424 TM = TG + TL;
Chris@42 425 TN = Tr + TM;
Chris@42 426 T1H = TM - Tr;
Chris@42 427 T1S = T1m + T1l;
Chris@42 428 T1U = T1S - T1T;
Chris@42 429 T26 = T1T + T1S;
Chris@42 430 TV = TR + TU;
Chris@42 431 TZ = TV - TY;
Chris@42 432 T1J = TV + TY;
Chris@42 433 {
Chris@42 434 E T1O, T14, T1r, T1n, T1v;
Chris@42 435 T1O = T13 + T12;
Chris@42 436 T1Q = T1O + T1P;
Chris@42 437 T24 = T1O - T1P;
Chris@42 438 T14 = T12 - T13;
Chris@42 439 T1c = T14 - T1b;
Chris@42 440 T1C = T14 + T1b;
Chris@42 441 T1r = TL - TG;
Chris@42 442 T1t = T1r - T1s;
Chris@42 443 T29 = T1s + T1r;
Chris@42 444 T1n = T1l - T1m;
Chris@42 445 T1o = T1g + T1n;
Chris@42 446 T1E = T1n - T1g;
Chris@42 447 T1v = TU - TR;
Chris@42 448 T1x = T1v + T1w;
Chris@42 449 T2b = T1v - T1w;
Chris@42 450 {
Chris@42 451 E T1X, T1Z, T1W, T1Y;
Chris@42 452 T1X = TS + TT;
Chris@42 453 T1Z = TJ + TK;
Chris@42 454 T1W = W[9];
Chris@42 455 T1Y = W[8];
Chris@42 456 T20 = FMA(T1W, T1X, T1Y * T1Z);
Chris@42 457 T21 = FNMS(T1W, T1Z, T1Y * T1X);
Chris@42 458 }
Chris@42 459 }
Chris@42 460 {
Chris@42 461 E T10, T2f, Tk, TO;
Chris@42 462 Tk = W[0];
Chris@42 463 TO = W[1];
Chris@42 464 T10 = FMA(Tk, TN, TO * TZ);
Chris@42 465 T2f = FNMS(TO, TN, Tk * TZ);
Chris@42 466 Rp[0] = Tj - T10;
Chris@42 467 Ip[0] = T2f + T2g;
Chris@42 468 Rm[0] = Tj + T10;
Chris@42 469 Im[0] = T2f - T2g;
Chris@42 470 }
Chris@42 471 {
Chris@42 472 E T1V, T22, T1N, T1R;
Chris@42 473 T1N = W[6];
Chris@42 474 T1R = W[7];
Chris@42 475 T1V = FNMS(T1R, T1U, T1N * T1Q);
Chris@42 476 T22 = FMA(T1R, T1Q, T1N * T1U);
Chris@42 477 Rp[WS(rs, 2)] = T1V - T20;
Chris@42 478 Ip[WS(rs, 2)] = T21 + T22;
Chris@42 479 Rm[WS(rs, 2)] = T20 + T1V;
Chris@42 480 Im[WS(rs, 2)] = T21 - T22;
Chris@42 481 }
Chris@42 482 {
Chris@42 483 E T1p, T1A, T1y, T1z;
Chris@42 484 {
Chris@42 485 E T11, T1d, T1q, T1u;
Chris@42 486 T11 = W[2];
Chris@42 487 T1d = W[3];
Chris@42 488 T1p = FNMS(T1d, T1o, T11 * T1c);
Chris@42 489 T1A = FMA(T1d, T1c, T11 * T1o);
Chris@42 490 T1q = W[4];
Chris@42 491 T1u = W[5];
Chris@42 492 T1y = FMA(T1q, T1t, T1u * T1x);
Chris@42 493 T1z = FNMS(T1u, T1t, T1q * T1x);
Chris@42 494 }
Chris@42 495 Rp[WS(rs, 1)] = T1p - T1y;
Chris@42 496 Ip[WS(rs, 1)] = T1z + T1A;
Chris@42 497 Rm[WS(rs, 1)] = T1y + T1p;
Chris@42 498 Im[WS(rs, 1)] = T1z - T1A;
Chris@42 499 }
Chris@42 500 {
Chris@42 501 E T1F, T1M, T1K, T1L;
Chris@42 502 {
Chris@42 503 E T1B, T1D, T1G, T1I;
Chris@42 504 T1B = W[14];
Chris@42 505 T1D = W[15];
Chris@42 506 T1F = FNMS(T1D, T1E, T1B * T1C);
Chris@42 507 T1M = FMA(T1D, T1C, T1B * T1E);
Chris@42 508 T1G = W[16];
Chris@42 509 T1I = W[17];
Chris@42 510 T1K = FMA(T1G, T1H, T1I * T1J);
Chris@42 511 T1L = FNMS(T1I, T1H, T1G * T1J);
Chris@42 512 }
Chris@42 513 Rp[WS(rs, 4)] = T1F - T1K;
Chris@42 514 Ip[WS(rs, 4)] = T1L + T1M;
Chris@42 515 Rm[WS(rs, 4)] = T1K + T1F;
Chris@42 516 Im[WS(rs, 4)] = T1L - T1M;
Chris@42 517 }
Chris@42 518 {
Chris@42 519 E T27, T2e, T2c, T2d;
Chris@42 520 {
Chris@42 521 E T23, T25, T28, T2a;
Chris@42 522 T23 = W[10];
Chris@42 523 T25 = W[11];
Chris@42 524 T27 = FNMS(T25, T26, T23 * T24);
Chris@42 525 T2e = FMA(T25, T24, T23 * T26);
Chris@42 526 T28 = W[12];
Chris@42 527 T2a = W[13];
Chris@42 528 T2c = FMA(T28, T29, T2a * T2b);
Chris@42 529 T2d = FNMS(T2a, T29, T28 * T2b);
Chris@42 530 }
Chris@42 531 Rp[WS(rs, 3)] = T27 - T2c;
Chris@42 532 Ip[WS(rs, 3)] = T2d + T2e;
Chris@42 533 Rm[WS(rs, 3)] = T2c + T27;
Chris@42 534 Im[WS(rs, 3)] = T2d - T2e;
Chris@42 535 }
Chris@42 536 }
Chris@42 537 }
Chris@42 538 }
Chris@42 539 }
Chris@42 540
/*
 * Twiddle instruction table referenced by desc below.
 * NOTE(review): presumably requests the full twiddle set for size 10, which
 * would be consistent with the 18 W values consumed per loop iteration;
 * confirm against the tw_instr definition.
 */
Chris@42 541 static const tw_instr twinstr[] = {
Chris@42 542 {TW_FULL, 1, 10},
Chris@42 543 {TW_NEXT, 1, 0}
Chris@42 544 };
Chris@42 545
/*
 * Codelet descriptor: size 10, codelet name, twiddle table and GENUS.
 * The trailing {92, 30, 30, 0} matches the operation counts quoted in the
 * generated header comment of this non-FMA variant (92 additions,
 * 30 multiplications, 30 fused multiply/add).
 */
Chris@42 546 static const hc2c_desc desc = { 10, "hc2cbdft_10", twinstr, &GENUS, {92, 30, 30, 0} };
Chris@42 547
/*
 * Registration entry point: hands the non-FMA hc2cbdft_10 kernel and its
 * descriptor to planner p through khc2c_register, tagged HC2C_VIA_DFT.
 */
Chris@42 548 void X(codelet_hc2cbdft_10) (planner *p) {
Chris@42 549 X(khc2c_register) (p, hc2cbdft_10, &desc, HC2C_VIA_DFT);
Chris@42 550 }
Chris@42 551 #endif /* HAVE_FMA */