annotate src/fftw-3.3.5/dft/scalar/codelets/q1_4.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:37:39 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twidsq.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include q.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 88 FP additions, 48 FP multiplications,
Chris@42 32 * (or, 64 additions, 24 multiplications, 24 fused multiply/add),
Chris@42 33 * 76 stack variables, 0 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "q.h"
Chris@42 36
/*
 * q1_4 (HAVE_FMA variant): machine-generated kernel; see the "Generated by"
 * line above (gen_twidsq, -fma, -dif, -n 4).
 *
 * Notation used in the comments below:
 *   v = input group index: addressed with WS(vs, v) on load, WS(rs, v) on store
 *   r = element index inside a group on load: WS(rs, r)
 *   k = output index on store: WS(vs, k)
 *
 * For every m in [mb, me) the loop body:
 *   1. loads a 4x4 tile of values from rio and from iio at
 *      WS(vs, v) + WS(rs, r), v, r = 0..3 (32 loads in total);
 *   2. for each v combines the four r-elements using additions and
 *      subtractions only;
 *   3. stores the plain sums at index WS(rs, v) without multiplying by W;
 *   4. combines the three remaining results per group with the coefficient
 *      pairs (W[0], W[1]), (W[2], W[3]) and (W[4], W[5]) and stores them at
 *      WS(vs, k) + WS(rs, v) for k = 1, 2, 3.
 * Since v is applied through vs on load but through rs on store, the tile is
 * written back transposed, in place.  All loads are issued before the first
 * store.
 *
 * Parameters:
 *   rio, iio  data arrays, read and overwritten; both advance by ms each
 *             iteration
 *   W         coefficient table; starts at W + 6 * mb and advances by 6 each
 *             iteration
 *   rs, vs    strides, only used through WS()
 *   mb, me    half-open range of the loop counter m
 *   ms        per-iteration step applied to rio and iio
 *
 * NOTE(review): E, R, INT, stride, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are
 * defined in headers not visible here.  FMA(a, b, c) is presumably a * b + c
 * and FNMS(a, b, c) presumably c - a * b -- confirm against the headers.
 * rio/iio presumably hold real/imaginary parts -- confirm.
 */
Chris@42 37 static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 {
Chris@42 40 INT m;
/* One tile per iteration; W is first offset by 6 * mb, then all pointers step together. */
Chris@42 41 for (m = mb, W = W + (mb * 6); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
/* Declared in the outermost scope because the last two stores (k = 3, v = 3) happen after the inner blocks close. */
Chris@42 42 E T1X, T1S, T1L, T1Y, T1R;
Chris@42 43 {
Chris@42 44 E T3, Tf, Tv, Ti, Tw, Tx, T6, Tm, Tc, Ts, T1T, T1H, T29, T1W, T2a;
Chris@42 45 E T2b, T1K, T20, T1Q, T26, TN, TB, T13, TQ, T14, T15, TE, TU, TK, T10;
Chris@42 46 E T1l, T19, T1a, T1h, T1B, T1o, T1C, T1b, T1D, T1e, T1c;
Chris@42 47 {
Chris@42 48 E T1I, T1P, T1J, T1M;
Chris@42 49 {
Chris@42 50 E Tb, T4, T5, T8;
Chris@42 51 {
Chris@42 52 E T1, T2, T9, Ta, Tg, Th;
/* Group v = 0: loads and first-level sums/differences. */
Chris@42 53 T1 = rio[0];
Chris@42 54 T2 = rio[WS(rs, 2)];
Chris@42 55 T9 = iio[0];
Chris@42 56 Ta = iio[WS(rs, 2)];
Chris@42 57 Tg = iio[WS(rs, 1)];
Chris@42 58 T3 = T1 + T2;
Chris@42 59 Tf = T1 - T2;
Chris@42 60 Th = iio[WS(rs, 3)];
Chris@42 61 Tv = T9 + Ta;
Chris@42 62 Tb = T9 - Ta;
Chris@42 63 T4 = rio[WS(rs, 1)];
Chris@42 64 Ti = Tg - Th;
Chris@42 65 Tw = Tg + Th;
Chris@42 66 T5 = rio[WS(rs, 3)];
Chris@42 67 }
Chris@42 68 Tx = Tv - Tw;
Chris@42 69 T8 = T4 - T5;
Chris@42 70 T6 = T4 + T5;
Chris@42 71 {
Chris@42 72 E T1N, T1O, T1F, T1G, T1U, T1V;
/* Group v = 3: loads, interleaved with the remaining v = 0 arithmetic. */
Chris@42 73 T1F = rio[WS(vs, 3)];
Chris@42 74 T1G = rio[WS(vs, 3) + WS(rs, 2)];
Chris@42 75 Tm = Tb - T8;
Chris@42 76 Tc = T8 + Tb;
Chris@42 77 Ts = T3 - T6;
Chris@42 78 T1T = T1F - T1G;
Chris@42 79 T1H = T1F + T1G;
Chris@42 80 T1N = iio[WS(vs, 3)];
Chris@42 81 T1O = iio[WS(vs, 3) + WS(rs, 2)];
Chris@42 82 T1U = iio[WS(vs, 3) + WS(rs, 1)];
Chris@42 83 T1V = iio[WS(vs, 3) + WS(rs, 3)];
Chris@42 84 T1I = rio[WS(vs, 3) + WS(rs, 1)];
Chris@42 85 T1P = T1N - T1O;
Chris@42 86 T29 = T1N + T1O;
Chris@42 87 T1W = T1U - T1V;
Chris@42 88 T2a = T1U + T1V;
Chris@42 89 T1J = rio[WS(vs, 3) + WS(rs, 3)];
Chris@42 90 }
Chris@42 91 }
Chris@42 92 T2b = T29 - T2a;
Chris@42 93 T1M = T1I - T1J;
Chris@42 94 T1K = T1I + T1J;
Chris@42 95 {
Chris@42 96 E TC, TJ, TD, TG;
Chris@42 97 {
Chris@42 98 E TH, TI, Tz, TA, TO, TP;
/* Group v = 1: loads, interleaved with the remaining v = 3 arithmetic. */
Chris@42 99 Tz = rio[WS(vs, 1)];
Chris@42 100 TA = rio[WS(vs, 1) + WS(rs, 2)];
Chris@42 101 T20 = T1P - T1M;
Chris@42 102 T1Q = T1M + T1P;
Chris@42 103 T26 = T1H - T1K;
Chris@42 104 TN = Tz - TA;
Chris@42 105 TB = Tz + TA;
Chris@42 106 TH = iio[WS(vs, 1)];
Chris@42 107 TI = iio[WS(vs, 1) + WS(rs, 2)];
Chris@42 108 TO = iio[WS(vs, 1) + WS(rs, 1)];
Chris@42 109 TP = iio[WS(vs, 1) + WS(rs, 3)];
Chris@42 110 TC = rio[WS(vs, 1) + WS(rs, 1)];
Chris@42 111 TJ = TH - TI;
Chris@42 112 T13 = TH + TI;
Chris@42 113 TQ = TO - TP;
Chris@42 114 T14 = TO + TP;
Chris@42 115 TD = rio[WS(vs, 1) + WS(rs, 3)];
Chris@42 116 }
Chris@42 117 T15 = T13 - T14;
Chris@42 118 TG = TC - TD;
Chris@42 119 TE = TC + TD;
Chris@42 120 {
Chris@42 121 E T1f, T1g, T17, T18, T1m, T1n;
/* Group v = 2: loads, interleaved with the remaining v = 1 arithmetic. */
Chris@42 122 T17 = rio[WS(vs, 2)];
Chris@42 123 T18 = rio[WS(vs, 2) + WS(rs, 2)];
Chris@42 124 TU = TJ - TG;
Chris@42 125 TK = TG + TJ;
Chris@42 126 T10 = TB - TE;
Chris@42 127 T1l = T17 - T18;
Chris@42 128 T19 = T17 + T18;
Chris@42 129 T1f = iio[WS(vs, 2)];
Chris@42 130 T1g = iio[WS(vs, 2) + WS(rs, 2)];
Chris@42 131 T1m = iio[WS(vs, 2) + WS(rs, 1)];
Chris@42 132 T1n = iio[WS(vs, 2) + WS(rs, 3)];
Chris@42 133 T1a = rio[WS(vs, 2) + WS(rs, 1)];
Chris@42 134 T1h = T1f - T1g;
Chris@42 135 T1B = T1f + T1g;
Chris@42 136 T1o = T1m - T1n;
Chris@42 137 T1C = T1m + T1n;
Chris@42 138 T1b = rio[WS(vs, 2) + WS(rs, 3)];
Chris@42 139 }
Chris@42 140 }
Chris@42 141 }
Chris@42 142 T1D = T1B - T1C;
Chris@42 143 T1e = T1a - T1b;
Chris@42 144 T1c = T1a + T1b;
Chris@42 145 {
Chris@42 146 E T1s, T1i, T1y, T28, T27, Tr, Tu;
/* All loads are done.  Store the plain sums of group v at index WS(rs, v); W is not used for these. */
Chris@42 147 rio[0] = T3 + T6;
Chris@42 148 iio[0] = Tv + Tw;
Chris@42 149 T1s = T1h - T1e;
Chris@42 150 T1i = T1e + T1h;
Chris@42 151 T1y = T19 - T1c;
Chris@42 152 rio[WS(rs, 1)] = TB + TE;
Chris@42 153 iio[WS(rs, 1)] = T13 + T14;
Chris@42 154 rio[WS(rs, 2)] = T19 + T1c;
Chris@42 155 iio[WS(rs, 2)] = T1B + T1C;
Chris@42 156 iio[WS(rs, 3)] = T29 + T2a;
Chris@42 157 rio[WS(rs, 3)] = T1H + T1K;
/* k = 2 outputs use (W[2], W[3]); W[2]/W[3] are re-read for each group (generator flag -reload-twiddle). */
Chris@42 158 Tr = W[2];
Chris@42 159 Tu = W[3];
Chris@42 160 {
Chris@42 161 E T25, Ty, Tt, T2c;
Chris@42 162 T25 = W[2];
Chris@42 163 T28 = W[3];
Chris@42 164 Ty = Tr * Tx;
Chris@42 165 Tt = Tr * Ts;
Chris@42 166 T2c = T25 * T2b;
Chris@42 167 T27 = T25 * T26;
Chris@42 168 iio[WS(vs, 2)] = FNMS(Tu, Ts, Ty);
Chris@42 169 rio[WS(vs, 2)] = FMA(Tu, Tx, Tt);
Chris@42 170 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T28, T26, T2c);
Chris@42 171 }
Chris@42 172 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T28, T2b, T27);
Chris@42 173 {
Chris@42 174 E Tp, T1v, T23, T22, T1Z, TR, TM, TF;
Chris@42 175 {
Chris@42 176 E T1A, T1z, TZ, T12;
Chris@42 177 TZ = W[2];
Chris@42 178 T12 = W[3];
Chris@42 179 {
Chris@42 180 E T1x, T16, T11, T1E;
Chris@42 181 T1x = W[2];
Chris@42 182 T1A = W[3];
Chris@42 183 T16 = TZ * T15;
Chris@42 184 T11 = TZ * T10;
Chris@42 185 T1E = T1x * T1D;
Chris@42 186 T1z = T1x * T1y;
Chris@42 187 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T12, T10, T16);
Chris@42 188 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T12, T15, T11);
Chris@42 189 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1A, T1y, T1E);
Chris@42 190 }
Chris@42 191 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1A, T1D, T1z);
Chris@42 192 {
Chris@42 193 E Tj, Te, T7, T1p, T1k, T1j;
/* k = 3 outputs use (W[4], W[5]); k = 1 outputs use (W[0], W[1]).  The scheduler interleaves the two from here on. */
Chris@42 194 Tp = Tf + Ti;
Chris@42 195 Tj = Tf - Ti;
Chris@42 196 Te = W[5];
Chris@42 197 T7 = W[4];
Chris@42 198 {
Chris@42 199 E T1d, T1q, Tk, Td;
Chris@42 200 T1p = T1l - T1o;
Chris@42 201 T1v = T1l + T1o;
Chris@42 202 T1k = W[5];
Chris@42 203 Tk = Te * Tc;
Chris@42 204 Td = T7 * Tc;
Chris@42 205 T1d = W[4];
Chris@42 206 T1q = T1k * T1i;
Chris@42 207 rio[WS(vs, 3)] = FMA(T7, Tj, Tk);
Chris@42 208 iio[WS(vs, 3)] = FNMS(Te, Tj, Td);
Chris@42 209 T1j = T1d * T1i;
Chris@42 210 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T1d, T1p, T1q);
Chris@42 211 }
Chris@42 212 T23 = T1T + T1W;
Chris@42 213 T1X = T1T - T1W;
Chris@42 214 T22 = W[1];
Chris@42 215 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T1k, T1p, T1j);
Chris@42 216 T1Z = W[0];
Chris@42 217 }
Chris@42 218 }
Chris@42 219 {
Chris@42 220 E TX, TW, TT, TY, TV, T24, T21;
Chris@42 221 TX = TN + TQ;
Chris@42 222 TR = TN - TQ;
Chris@42 223 T24 = T22 * T20;
Chris@42 224 TW = W[1];
Chris@42 225 T21 = T1Z * T20;
Chris@42 226 TT = W[0];
Chris@42 227 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1Z, T23, T24);
Chris@42 228 TY = TW * TU;
Chris@42 229 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T22, T23, T21);
Chris@42 230 TV = TT * TU;
Chris@42 231 rio[WS(vs, 1) + WS(rs, 1)] = FMA(TT, TX, TY);
Chris@42 232 TM = W[5];
Chris@42 233 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TW, TX, TV);
Chris@42 234 TF = W[4];
Chris@42 235 }
Chris@42 236 {
Chris@42 237 E To, Tl, Tq, Tn, TS, TL;
Chris@42 238 TS = TM * TK;
Chris@42 239 To = W[1];
Chris@42 240 TL = TF * TK;
Chris@42 241 Tl = W[0];
Chris@42 242 rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TR, TS);
Chris@42 243 Tq = To * Tm;
Chris@42 244 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TM, TR, TL);
Chris@42 245 Tn = Tl * Tm;
Chris@42 246 {
Chris@42 247 E T1u, T1r, T1w, T1t;
Chris@42 248 rio[WS(vs, 1)] = FMA(Tl, Tp, Tq);
Chris@42 249 T1u = W[1];
Chris@42 250 iio[WS(vs, 1)] = FNMS(To, Tp, Tn);
Chris@42 251 T1r = W[0];
Chris@42 252 T1w = T1u * T1s;
Chris@42 253 T1S = W[5];
Chris@42 254 T1t = T1r * T1s;
Chris@42 255 T1L = W[4];
Chris@42 256 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1r, T1v, T1w);
Chris@42 257 T1Y = T1S * T1Q;
Chris@42 258 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1u, T1v, T1t);
Chris@42 259 T1R = T1L * T1Q;
Chris@42 260 }
Chris@42 261 }
Chris@42 262 }
Chris@42 263 }
Chris@42 264 }
/* Final pair of stores: k = 3, v = 3, built from the temporaries hoisted to the loop scope. */
Chris@42 265 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1L, T1X, T1Y);
Chris@42 266 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1S, T1X, T1R);
Chris@42 267 }
Chris@42 268 }
Chris@42 269 }
Chris@42 270
/*
 * Table of tw_instr entries referenced by desc below: one TW_FULL entry with
 * arguments (0, 4) followed by one TW_NEXT entry with arguments (1, 0).
 * NOTE(review): the field meanings of tw_instr are defined outside this file;
 * presumably this describes the layout of the W table consumed by q1_4 (which
 * reads W[0..5] per iteration) -- confirm against the tw_instr declaration.
 */
Chris@42 271 static const tw_instr twinstr[] = {
Chris@42 272 {TW_FULL, 0, 4},
Chris@42 273 {TW_NEXT, 1, 0}
Chris@42 274 };
Chris@42 275
/*
 * Descriptor passed to the registration call below.  It carries the value 4,
 * the name string "q1_4", the twinstr table and &GENUS.  The first three
 * numbers of {64, 24, 24, 0} equal the addition / multiplication / fused
 * multiply-add counts quoted in the generated comment above q1_4.
 * NOTE(review): the meaning of the remaining fields is fixed by the ct_desc
 * declaration, which is not visible here.
 */
Chris@42 276 static const ct_desc desc = { 4, "q1_4", twinstr, &GENUS, {64, 24, 24, 0}, 0, 0, 0 };
Chris@42 277
/*
 * Registration entry point: hands the q1_4 kernel and its descriptor to
 * planner p through X(kdft_difsq_register).
 * NOTE(review): X() is a macro defined elsewhere, presumably a symbol-name
 * wrapper -- confirm.
 */
Chris@42 278 void X(codelet_q1_4) (planner *p) {
Chris@42 279 X(kdft_difsq_register) (p, q1_4, &desc);
Chris@42 280 }
Chris@42 281 #else /* HAVE_FMA */
Chris@42 282
Chris@42 283 /* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include q.h */
Chris@42 284
Chris@42 285 /*
Chris@42 286 * This function contains 88 FP additions, 48 FP multiplications,
Chris@42 287 * (or, 64 additions, 24 multiplications, 24 fused multiply/add),
Chris@42 288 * 37 stack variables, 0 constants, and 64 memory accesses
Chris@42 289 */
Chris@42 290 #include "q.h"
Chris@42 291
/*
 * q1_4 (variant compiled when HAVE_FMA is not defined): machine-generated
 * kernel; see the "Generated by" line above (gen_twidsq, -dif, -n 4).
 *
 * Notation used in the comments below:
 *   v = input group index: addressed with WS(vs, v) on load, WS(rs, v) on store
 *   r = element index inside a group on load: WS(rs, r)
 *   k = output index on store: WS(vs, k)
 *
 * For every m in [mb, me) the loop body:
 *   1. loads a 4x4 tile of values from rio and from iio at
 *      WS(vs, v) + WS(rs, r), v, r = 0..3, forming the sum and difference of
 *      elements r = 0, 2 and of elements r = 1, 3 as it goes;
 *   2. stores the plain sums of each group at index WS(rs, v) without
 *      multiplying by W;
 *   3. combines the three remaining results per group with the coefficient
 *      pairs (W[0], W[1]) for k = 1, (W[2], W[3]) for k = 2 and (W[4], W[5])
 *      for k = 3, storing them at WS(vs, k) + WS(rs, v).
 * Since v is applied through vs on load but through rs on store, the tile is
 * written back transposed, in place.  All loads are issued before the first
 * store.
 *
 * Parameters:
 *   rio, iio  data arrays, read and overwritten; both advance by ms each
 *             iteration
 *   W         coefficient table; starts at W + 6 * mb and advances by 6 each
 *             iteration
 *   rs, vs    strides, only used through WS()
 *   mb, me    half-open range of the loop counter m
 *   ms        per-iteration step applied to rio and iio
 *
 * NOTE(review): E, R, INT, stride, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are
 * defined in headers not visible here.  FMA(a, b, c) is presumably a * b + c
 * and FNMS(a, b, c) presumably c - a * b -- confirm against the headers.
 * rio/iio presumably hold real/imaginary parts -- confirm.
 */
Chris@42 292 static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@42 293 {
Chris@42 294 {
Chris@42 295 INT m;
/* One tile per iteration; W is first offset by 6 * mb, then all pointers step together. */
Chris@42 296 for (m = mb, W = W + (mb * 6); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
Chris@42 297 E T3, Te, Tb, Tq, T6, T8, Th, Tr, Tv, TG, TD, TS, Ty, TA, TJ;
Chris@42 298 E TT, TX, T18, T15, T1k, T10, T12, T1b, T1l, T1p, T1A, T1x, T1M, T1s, T1u;
Chris@42 299 E T1D, T1N;
/* v = 0, r = 0 and 2 */
Chris@42 300 {
Chris@42 301 E T1, T2, T9, Ta;
Chris@42 302 T1 = rio[0];
Chris@42 303 T2 = rio[WS(rs, 2)];
Chris@42 304 T3 = T1 + T2;
Chris@42 305 Te = T1 - T2;
Chris@42 306 T9 = iio[0];
Chris@42 307 Ta = iio[WS(rs, 2)];
Chris@42 308 Tb = T9 - Ta;
Chris@42 309 Tq = T9 + Ta;
Chris@42 310 }
/* v = 0, r = 1 and 3 */
Chris@42 311 {
Chris@42 312 E T4, T5, Tf, Tg;
Chris@42 313 T4 = rio[WS(rs, 1)];
Chris@42 314 T5 = rio[WS(rs, 3)];
Chris@42 315 T6 = T4 + T5;
Chris@42 316 T8 = T4 - T5;
Chris@42 317 Tf = iio[WS(rs, 1)];
Chris@42 318 Tg = iio[WS(rs, 3)];
Chris@42 319 Th = Tf - Tg;
Chris@42 320 Tr = Tf + Tg;
Chris@42 321 }
/* v = 1, r = 0 and 2 */
Chris@42 322 {
Chris@42 323 E Tt, Tu, TB, TC;
Chris@42 324 Tt = rio[WS(vs, 1)];
Chris@42 325 Tu = rio[WS(vs, 1) + WS(rs, 2)];
Chris@42 326 Tv = Tt + Tu;
Chris@42 327 TG = Tt - Tu;
Chris@42 328 TB = iio[WS(vs, 1)];
Chris@42 329 TC = iio[WS(vs, 1) + WS(rs, 2)];
Chris@42 330 TD = TB - TC;
Chris@42 331 TS = TB + TC;
Chris@42 332 }
/* v = 1, r = 1 and 3 */
Chris@42 333 {
Chris@42 334 E Tw, Tx, TH, TI;
Chris@42 335 Tw = rio[WS(vs, 1) + WS(rs, 1)];
Chris@42 336 Tx = rio[WS(vs, 1) + WS(rs, 3)];
Chris@42 337 Ty = Tw + Tx;
Chris@42 338 TA = Tw - Tx;
Chris@42 339 TH = iio[WS(vs, 1) + WS(rs, 1)];
Chris@42 340 TI = iio[WS(vs, 1) + WS(rs, 3)];
Chris@42 341 TJ = TH - TI;
Chris@42 342 TT = TH + TI;
Chris@42 343 }
/* v = 2, r = 0 and 2 */
Chris@42 344 {
Chris@42 345 E TV, TW, T13, T14;
Chris@42 346 TV = rio[WS(vs, 2)];
Chris@42 347 TW = rio[WS(vs, 2) + WS(rs, 2)];
Chris@42 348 TX = TV + TW;
Chris@42 349 T18 = TV - TW;
Chris@42 350 T13 = iio[WS(vs, 2)];
Chris@42 351 T14 = iio[WS(vs, 2) + WS(rs, 2)];
Chris@42 352 T15 = T13 - T14;
Chris@42 353 T1k = T13 + T14;
Chris@42 354 }
/* v = 2, r = 1 and 3 */
Chris@42 355 {
Chris@42 356 E TY, TZ, T19, T1a;
Chris@42 357 TY = rio[WS(vs, 2) + WS(rs, 1)];
Chris@42 358 TZ = rio[WS(vs, 2) + WS(rs, 3)];
Chris@42 359 T10 = TY + TZ;
Chris@42 360 T12 = TY - TZ;
Chris@42 361 T19 = iio[WS(vs, 2) + WS(rs, 1)];
Chris@42 362 T1a = iio[WS(vs, 2) + WS(rs, 3)];
Chris@42 363 T1b = T19 - T1a;
Chris@42 364 T1l = T19 + T1a;
Chris@42 365 }
/* v = 3, r = 0 and 2 */
Chris@42 366 {
Chris@42 367 E T1n, T1o, T1v, T1w;
Chris@42 368 T1n = rio[WS(vs, 3)];
Chris@42 369 T1o = rio[WS(vs, 3) + WS(rs, 2)];
Chris@42 370 T1p = T1n + T1o;
Chris@42 371 T1A = T1n - T1o;
Chris@42 372 T1v = iio[WS(vs, 3)];
Chris@42 373 T1w = iio[WS(vs, 3) + WS(rs, 2)];
Chris@42 374 T1x = T1v - T1w;
Chris@42 375 T1M = T1v + T1w;
Chris@42 376 }
/* v = 3, r = 1 and 3 */
Chris@42 377 {
Chris@42 378 E T1q, T1r, T1B, T1C;
Chris@42 379 T1q = rio[WS(vs, 3) + WS(rs, 1)];
Chris@42 380 T1r = rio[WS(vs, 3) + WS(rs, 3)];
Chris@42 381 T1s = T1q + T1r;
Chris@42 382 T1u = T1q - T1r;
Chris@42 383 T1B = iio[WS(vs, 3) + WS(rs, 1)];
Chris@42 384 T1C = iio[WS(vs, 3) + WS(rs, 3)];
Chris@42 385 T1D = T1B - T1C;
Chris@42 386 T1N = T1B + T1C;
Chris@42 387 }
/* All loads are done.  Store the plain sums of group v at index WS(rs, v); W is not used for these. */
Chris@42 388 rio[0] = T3 + T6;
Chris@42 389 iio[0] = Tq + Tr;
Chris@42 390 rio[WS(rs, 1)] = Tv + Ty;
Chris@42 391 iio[WS(rs, 1)] = TS + TT;
Chris@42 392 rio[WS(rs, 2)] = TX + T10;
Chris@42 393 iio[WS(rs, 2)] = T1k + T1l;
Chris@42 394 iio[WS(rs, 3)] = T1M + T1N;
Chris@42 395 rio[WS(rs, 3)] = T1p + T1s;
/* k = 3, v = 0 */
Chris@42 396 {
Chris@42 397 E Tc, Ti, T7, Td;
Chris@42 398 Tc = T8 + Tb;
Chris@42 399 Ti = Te - Th;
Chris@42 400 T7 = W[4];
Chris@42 401 Td = W[5];
Chris@42 402 iio[WS(vs, 3)] = FNMS(Td, Ti, T7 * Tc);
Chris@42 403 rio[WS(vs, 3)] = FMA(Td, Tc, T7 * Ti);
Chris@42 404 }
/* k = 2, v = 3 */
Chris@42 405 {
Chris@42 406 E T1K, T1O, T1J, T1L;
Chris@42 407 T1K = T1p - T1s;
Chris@42 408 T1O = T1M - T1N;
Chris@42 409 T1J = W[2];
Chris@42 410 T1L = W[3];
Chris@42 411 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T1J, T1K, T1L * T1O);
Chris@42 412 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T1L, T1K, T1J * T1O);
Chris@42 413 }
/* k = 1, v = 0 */
Chris@42 414 {
Chris@42 415 E Tk, Tm, Tj, Tl;
Chris@42 416 Tk = Tb - T8;
Chris@42 417 Tm = Te + Th;
Chris@42 418 Tj = W[0];
Chris@42 419 Tl = W[1];
Chris@42 420 iio[WS(vs, 1)] = FNMS(Tl, Tm, Tj * Tk);
Chris@42 421 rio[WS(vs, 1)] = FMA(Tl, Tk, Tj * Tm);
Chris@42 422 }
/* k = 2, v = 0 */
Chris@42 423 {
Chris@42 424 E To, Ts, Tn, Tp;
Chris@42 425 To = T3 - T6;
Chris@42 426 Ts = Tq - Tr;
Chris@42 427 Tn = W[2];
Chris@42 428 Tp = W[3];
Chris@42 429 rio[WS(vs, 2)] = FMA(Tn, To, Tp * Ts);
Chris@42 430 iio[WS(vs, 2)] = FNMS(Tp, To, Tn * Ts);
Chris@42 431 }
/* k = 3, v = 2 */
Chris@42 432 {
Chris@42 433 E T16, T1c, T11, T17;
Chris@42 434 T16 = T12 + T15;
Chris@42 435 T1c = T18 - T1b;
Chris@42 436 T11 = W[4];
Chris@42 437 T17 = W[5];
Chris@42 438 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T17, T1c, T11 * T16);
Chris@42 439 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T17, T16, T11 * T1c);
Chris@42 440 }
/* k = 1, v = 3 */
Chris@42 441 {
Chris@42 442 E T1G, T1I, T1F, T1H;
Chris@42 443 T1G = T1x - T1u;
Chris@42 444 T1I = T1A + T1D;
Chris@42 445 T1F = W[0];
Chris@42 446 T1H = W[1];
Chris@42 447 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T1H, T1I, T1F * T1G);
Chris@42 448 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1H, T1G, T1F * T1I);
Chris@42 449 }
/* k = 2, v = 1 */
Chris@42 450 {
Chris@42 451 E TQ, TU, TP, TR;
Chris@42 452 TQ = Tv - Ty;
Chris@42 453 TU = TS - TT;
Chris@42 454 TP = W[2];
Chris@42 455 TR = W[3];
Chris@42 456 rio[WS(vs, 2) + WS(rs, 1)] = FMA(TP, TQ, TR * TU);
Chris@42 457 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TR, TQ, TP * TU);
Chris@42 458 }
/* k = 1, v = 2 */
Chris@42 459 {
Chris@42 460 E T1e, T1g, T1d, T1f;
Chris@42 461 T1e = T15 - T12;
Chris@42 462 T1g = T18 + T1b;
Chris@42 463 T1d = W[0];
Chris@42 464 T1f = W[1];
Chris@42 465 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
Chris@42 466 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
Chris@42 467 }
/* k = 2, v = 2 */
Chris@42 468 {
Chris@42 469 E T1i, T1m, T1h, T1j;
Chris@42 470 T1i = TX - T10;
Chris@42 471 T1m = T1k - T1l;
Chris@42 472 T1h = W[2];
Chris@42 473 T1j = W[3];
Chris@42 474 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1h, T1i, T1j * T1m);
Chris@42 475 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1j, T1i, T1h * T1m);
Chris@42 476 }
/* k = 3, v = 3 */
Chris@42 477 {
Chris@42 478 E T1y, T1E, T1t, T1z;
Chris@42 479 T1y = T1u + T1x;
Chris@42 480 T1E = T1A - T1D;
Chris@42 481 T1t = W[4];
Chris@42 482 T1z = W[5];
Chris@42 483 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1z, T1E, T1t * T1y);
Chris@42 484 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1z, T1y, T1t * T1E);
Chris@42 485 }
/* k = 1, v = 1 */
Chris@42 486 {
Chris@42 487 E TM, TO, TL, TN;
Chris@42 488 TM = TD - TA;
Chris@42 489 TO = TG + TJ;
Chris@42 490 TL = W[0];
Chris@42 491 TN = W[1];
Chris@42 492 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TN, TO, TL * TM);
Chris@42 493 rio[WS(vs, 1) + WS(rs, 1)] = FMA(TN, TM, TL * TO);
Chris@42 494 }
/* k = 3, v = 1 */
Chris@42 495 {
Chris@42 496 E TE, TK, Tz, TF;
Chris@42 497 TE = TA + TD;
Chris@42 498 TK = TG - TJ;
Chris@42 499 Tz = W[4];
Chris@42 500 TF = W[5];
Chris@42 501 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TF, TK, Tz * TE);
Chris@42 502 rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TE, Tz * TK);
Chris@42 503 }
Chris@42 504 }
Chris@42 505 }
Chris@42 506 }
Chris@42 507
/*
 * Table of tw_instr entries referenced by desc below: one TW_FULL entry with
 * arguments (0, 4) followed by one TW_NEXT entry with arguments (1, 0).
 * NOTE(review): the field meanings of tw_instr are defined outside this file;
 * presumably this describes the layout of the W table consumed by q1_4 (which
 * reads W[0..5] per iteration) -- confirm against the tw_instr declaration.
 */
Chris@42 508 static const tw_instr twinstr[] = {
Chris@42 509 {TW_FULL, 0, 4},
Chris@42 510 {TW_NEXT, 1, 0}
Chris@42 511 };
Chris@42 512
/*
 * Descriptor passed to the registration call below.  It carries the value 4,
 * the name string "q1_4", the twinstr table and &GENUS.  The first three
 * numbers of {64, 24, 24, 0} equal the addition / multiplication / fused
 * multiply-add counts quoted in the generated comment above q1_4.
 * NOTE(review): the meaning of the remaining fields is fixed by the ct_desc
 * declaration, which is not visible here.
 */
Chris@42 513 static const ct_desc desc = { 4, "q1_4", twinstr, &GENUS, {64, 24, 24, 0}, 0, 0, 0 };
Chris@42 514
/*
 * Registration entry point: hands the q1_4 kernel and its descriptor to
 * planner p through X(kdft_difsq_register).
 * NOTE(review): X() is a macro defined elsewhere, presumably a symbol-name
 * wrapper -- confirm.
 */
Chris@42 515 void X(codelet_q1_4) (planner *p) {
Chris@42 516 X(kdft_difsq_register) (p, q1_4, &desc);
Chris@42 517 }
Chris@42 518 #endif /* HAVE_FMA */