annotate src/fftw-3.3.8/dft/scalar/codelets/q1_4.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:29 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include dft/scalar/q.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 88 FP additions, 48 FP multiplications,
Chris@82 32 * (or, 64 additions, 24 multiplications, 24 fused multiply/add),
Chris@82 33 * 51 stack variables, 0 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/q.h"
Chris@82 36
/*
 * q1_4, variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined.
 *
 * For each m in [mb, me) the function reads a 4x4 set of complex values
 * (real parts through rio, imaginary parts through iio) addressed as
 * [WS(vs, v) + WS(rs, r)] for v, r in 0..3.  For every row v it forms the
 * four sum/difference combinations of the entries r = 0..3 (a radix-4
 * butterfly), scales combinations 1..3 by the factor pairs (W[0], W[1]),
 * (W[2], W[3]) and (W[4], W[5]), and writes the results back into the same
 * arrays with the two indices exchanged: combination k of row v is stored
 * at [WS(vs, k) + WS(rs, v)].
 *
 * Scaling of a value (re, im) by a pair (wr, wi) stores
 *     real = wr * re + wi * im,    imag = wr * im - wi * re,
 * assuming FMA(a, b, c) is a*b + c and FNMS(a, b, c) is c - a*b; both macros
 * are defined outside this file -- TODO confirm.
 *
 * Parameters:
 *   rio, iio  real / imaginary data, read and overwritten in place
 *   W         factor table; six values are used per iteration, starting at
 *             W + mb * 6
 *   rs, vs    strides applied through WS() to the two indices of the set
 *   mb, me    half-open range of the loop counter m
 *   ms        amount added to rio and iio after every iteration
 *
 * All 32 input values are loaded into locals before the first store, which
 * is what makes the index-exchanging in-place update safe; do not reorder
 * the load and store phases.
 */
Chris@82 37 static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 {
Chris@82 40 INT m;
/* W is first moved to the factors for iteration mb and then advances by 6
 * per iteration.  NOTE(review): MAKE_VOLATILE_STRIDE is defined elsewhere;
 * presumably it has no arithmetic effect here -- confirm. */
Chris@82 41 for (m = mb, W = W + (mb * 6); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
/* Partial results that must stay live from the load phase to the store phase. */
Chris@82 42 E T3, Tv, Tw, T6, Tc, Tf, Tx, Ts, Tm, Ti, T1H, T29, T2a, T1K, T1Q;
Chris@82 43 E T1T, T2b, T26, T20, T1W, TB, T13, T14, TE, TK, TN, T15, T10, TU, TQ;
Chris@82 44 E T19, T1B, T1C, T1c, T1i, T1l, T1D, T1y, T1s, T1o;
/* Row v = 0: load entries r = 0..3 and form their sums and differences. */
Chris@82 45 {
Chris@82 46 E T1, T2, Tb, Tg, Th, T8;
Chris@82 47 {
Chris@82 48 E T9, Ta, T4, T5;
Chris@82 49 T1 = rio[0];
Chris@82 50 T2 = rio[WS(rs, 2)];
Chris@82 51 T3 = T1 + T2;
Chris@82 52 T9 = iio[0];
Chris@82 53 Ta = iio[WS(rs, 2)];
Chris@82 54 Tb = T9 - Ta;
Chris@82 55 Tv = T9 + Ta;
Chris@82 56 Tg = iio[WS(rs, 1)];
Chris@82 57 Th = iio[WS(rs, 3)];
Chris@82 58 Tw = Tg + Th;
Chris@82 59 T4 = rio[WS(rs, 1)];
Chris@82 60 T5 = rio[WS(rs, 3)];
Chris@82 61 T6 = T4 + T5;
Chris@82 62 T8 = T4 - T5;
Chris@82 63 }
Chris@82 64 Tc = T8 + Tb;
Chris@82 65 Tf = T1 - T2;
Chris@82 66 Tx = Tv - Tw;
Chris@82 67 Ts = T3 - T6;
Chris@82 68 Tm = Tb - T8;
Chris@82 69 Ti = Tg - Th;
Chris@82 70 }
/* Row v = 3: same computation on the entries at WS(vs, 3). */
Chris@82 71 {
Chris@82 72 E T1F, T1G, T1P, T1U, T1V, T1M;
Chris@82 73 {
Chris@82 74 E T1N, T1O, T1I, T1J;
Chris@82 75 T1F = rio[WS(vs, 3)];
Chris@82 76 T1G = rio[WS(vs, 3) + WS(rs, 2)];
Chris@82 77 T1H = T1F + T1G;
Chris@82 78 T1N = iio[WS(vs, 3)];
Chris@82 79 T1O = iio[WS(vs, 3) + WS(rs, 2)];
Chris@82 80 T1P = T1N - T1O;
Chris@82 81 T29 = T1N + T1O;
Chris@82 82 T1U = iio[WS(vs, 3) + WS(rs, 1)];
Chris@82 83 T1V = iio[WS(vs, 3) + WS(rs, 3)];
Chris@82 84 T2a = T1U + T1V;
Chris@82 85 T1I = rio[WS(vs, 3) + WS(rs, 1)];
Chris@82 86 T1J = rio[WS(vs, 3) + WS(rs, 3)];
Chris@82 87 T1K = T1I + T1J;
Chris@82 88 T1M = T1I - T1J;
Chris@82 89 }
Chris@82 90 T1Q = T1M + T1P;
Chris@82 91 T1T = T1F - T1G;
Chris@82 92 T2b = T29 - T2a;
Chris@82 93 T26 = T1H - T1K;
Chris@82 94 T20 = T1P - T1M;
Chris@82 95 T1W = T1U - T1V;
Chris@82 96 }
/* Row v = 1: same computation on the entries at WS(vs, 1). */
Chris@82 97 {
Chris@82 98 E Tz, TA, TJ, TO, TP, TG;
Chris@82 99 {
Chris@82 100 E TH, TI, TC, TD;
Chris@82 101 Tz = rio[WS(vs, 1)];
Chris@82 102 TA = rio[WS(vs, 1) + WS(rs, 2)];
Chris@82 103 TB = Tz + TA;
Chris@82 104 TH = iio[WS(vs, 1)];
Chris@82 105 TI = iio[WS(vs, 1) + WS(rs, 2)];
Chris@82 106 TJ = TH - TI;
Chris@82 107 T13 = TH + TI;
Chris@82 108 TO = iio[WS(vs, 1) + WS(rs, 1)];
Chris@82 109 TP = iio[WS(vs, 1) + WS(rs, 3)];
Chris@82 110 T14 = TO + TP;
Chris@82 111 TC = rio[WS(vs, 1) + WS(rs, 1)];
Chris@82 112 TD = rio[WS(vs, 1) + WS(rs, 3)];
Chris@82 113 TE = TC + TD;
Chris@82 114 TG = TC - TD;
Chris@82 115 }
Chris@82 116 TK = TG + TJ;
Chris@82 117 TN = Tz - TA;
Chris@82 118 T15 = T13 - T14;
Chris@82 119 T10 = TB - TE;
Chris@82 120 TU = TJ - TG;
Chris@82 121 TQ = TO - TP;
Chris@82 122 }
/* Row v = 2: same computation on the entries at WS(vs, 2). */
Chris@82 123 {
Chris@82 124 E T17, T18, T1h, T1m, T1n, T1e;
Chris@82 125 {
Chris@82 126 E T1f, T1g, T1a, T1b;
Chris@82 127 T17 = rio[WS(vs, 2)];
Chris@82 128 T18 = rio[WS(vs, 2) + WS(rs, 2)];
Chris@82 129 T19 = T17 + T18;
Chris@82 130 T1f = iio[WS(vs, 2)];
Chris@82 131 T1g = iio[WS(vs, 2) + WS(rs, 2)];
Chris@82 132 T1h = T1f - T1g;
Chris@82 133 T1B = T1f + T1g;
Chris@82 134 T1m = iio[WS(vs, 2) + WS(rs, 1)];
Chris@82 135 T1n = iio[WS(vs, 2) + WS(rs, 3)];
Chris@82 136 T1C = T1m + T1n;
Chris@82 137 T1a = rio[WS(vs, 2) + WS(rs, 1)];
Chris@82 138 T1b = rio[WS(vs, 2) + WS(rs, 3)];
Chris@82 139 T1c = T1a + T1b;
Chris@82 140 T1e = T1a - T1b;
Chris@82 141 }
Chris@82 142 T1i = T1e + T1h;
Chris@82 143 T1l = T17 - T18;
Chris@82 144 T1D = T1B - T1C;
Chris@82 145 T1y = T19 - T1c;
Chris@82 146 T1s = T1h - T1e;
Chris@82 147 T1o = T1m - T1n;
Chris@82 148 }
/* Store phase.  Combination 0 (the plain sum) of row v is written unscaled
 * to [WS(rs, v)]. */
Chris@82 149 rio[0] = T3 + T6;
Chris@82 150 iio[0] = Tv + Tw;
Chris@82 151 rio[WS(rs, 1)] = TB + TE;
Chris@82 152 iio[WS(rs, 1)] = T13 + T14;
Chris@82 153 rio[WS(rs, 2)] = T19 + T1c;
Chris@82 154 iio[WS(rs, 2)] = T1B + T1C;
Chris@82 155 iio[WS(rs, 3)] = T29 + T2a;
Chris@82 156 rio[WS(rs, 3)] = T1H + T1K;
/* Row 0, combination 2, scaled by (W[2], W[3]). */
Chris@82 157 {
Chris@82 158 E Tt, Ty, Tr, Tu;
Chris@82 159 Tr = W[2];
Chris@82 160 Tt = Tr * Ts;
Chris@82 161 Ty = Tr * Tx;
Chris@82 162 Tu = W[3];
Chris@82 163 rio[WS(vs, 2)] = FMA(Tu, Tx, Tt);
Chris@82 164 iio[WS(vs, 2)] = FNMS(Tu, Ts, Ty);
Chris@82 165 }
/* Row 3, combination 2, scaled by (W[2], W[3]). */
Chris@82 166 {
Chris@82 167 E T27, T2c, T25, T28;
Chris@82 168 T25 = W[2];
Chris@82 169 T27 = T25 * T26;
Chris@82 170 T2c = T25 * T2b;
Chris@82 171 T28 = W[3];
Chris@82 172 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T28, T2b, T27);
Chris@82 173 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T28, T26, T2c);
Chris@82 174 }
/* Row 1, combination 2, scaled by (W[2], W[3]). */
Chris@82 175 {
Chris@82 176 E T11, T16, TZ, T12;
Chris@82 177 TZ = W[2];
Chris@82 178 T11 = TZ * T10;
Chris@82 179 T16 = TZ * T15;
Chris@82 180 T12 = W[3];
Chris@82 181 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T12, T15, T11);
Chris@82 182 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T12, T10, T16);
Chris@82 183 }
/* Row 2, combination 2, scaled by (W[2], W[3]). */
Chris@82 184 {
Chris@82 185 E T1z, T1E, T1x, T1A;
Chris@82 186 T1x = W[2];
Chris@82 187 T1z = T1x * T1y;
Chris@82 188 T1E = T1x * T1D;
Chris@82 189 T1A = W[3];
Chris@82 190 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1A, T1D, T1z);
Chris@82 191 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1A, T1y, T1E);
Chris@82 192 }
/* Row 0, combination 3, scaled by (W[4], W[5]). */
Chris@82 193 {
Chris@82 194 E Tj, Te, Tk, T7, Td;
Chris@82 195 Tj = Tf - Ti;
Chris@82 196 Te = W[5];
Chris@82 197 Tk = Te * Tc;
Chris@82 198 T7 = W[4];
Chris@82 199 Td = T7 * Tc;
Chris@82 200 iio[WS(vs, 3)] = FNMS(Te, Tj, Td);
Chris@82 201 rio[WS(vs, 3)] = FMA(T7, Tj, Tk);
Chris@82 202 }
/* Row 2, combination 3, scaled by (W[4], W[5]). */
Chris@82 203 {
Chris@82 204 E T1p, T1k, T1q, T1d, T1j;
Chris@82 205 T1p = T1l - T1o;
Chris@82 206 T1k = W[5];
Chris@82 207 T1q = T1k * T1i;
Chris@82 208 T1d = W[4];
Chris@82 209 T1j = T1d * T1i;
Chris@82 210 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T1k, T1p, T1j);
Chris@82 211 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T1d, T1p, T1q);
Chris@82 212 }
/* Row 3, combination 1, scaled by (W[0], W[1]). */
Chris@82 213 {
Chris@82 214 E T23, T22, T24, T1Z, T21;
Chris@82 215 T23 = T1T + T1W;
Chris@82 216 T22 = W[1];
Chris@82 217 T24 = T22 * T20;
Chris@82 218 T1Z = W[0];
Chris@82 219 T21 = T1Z * T20;
Chris@82 220 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T22, T23, T21);
Chris@82 221 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1Z, T23, T24);
Chris@82 222 }
/* Row 1, combination 1, scaled by (W[0], W[1]). */
Chris@82 223 {
Chris@82 224 E TX, TW, TY, TT, TV;
Chris@82 225 TX = TN + TQ;
Chris@82 226 TW = W[1];
Chris@82 227 TY = TW * TU;
Chris@82 228 TT = W[0];
Chris@82 229 TV = TT * TU;
Chris@82 230 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TW, TX, TV);
Chris@82 231 rio[WS(vs, 1) + WS(rs, 1)] = FMA(TT, TX, TY);
Chris@82 232 }
/* Row 1, combination 3, scaled by (W[4], W[5]). */
Chris@82 233 {
Chris@82 234 E TR, TM, TS, TF, TL;
Chris@82 235 TR = TN - TQ;
Chris@82 236 TM = W[5];
Chris@82 237 TS = TM * TK;
Chris@82 238 TF = W[4];
Chris@82 239 TL = TF * TK;
Chris@82 240 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TM, TR, TL);
Chris@82 241 rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TR, TS);
Chris@82 242 }
/* Row 0, combination 1, scaled by (W[0], W[1]). */
Chris@82 243 {
Chris@82 244 E Tp, To, Tq, Tl, Tn;
Chris@82 245 Tp = Tf + Ti;
Chris@82 246 To = W[1];
Chris@82 247 Tq = To * Tm;
Chris@82 248 Tl = W[0];
Chris@82 249 Tn = Tl * Tm;
Chris@82 250 iio[WS(vs, 1)] = FNMS(To, Tp, Tn);
Chris@82 251 rio[WS(vs, 1)] = FMA(Tl, Tp, Tq);
Chris@82 252 }
/* Row 2, combination 1, scaled by (W[0], W[1]). */
Chris@82 253 {
Chris@82 254 E T1v, T1u, T1w, T1r, T1t;
Chris@82 255 T1v = T1l + T1o;
Chris@82 256 T1u = W[1];
Chris@82 257 T1w = T1u * T1s;
Chris@82 258 T1r = W[0];
Chris@82 259 T1t = T1r * T1s;
Chris@82 260 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1u, T1v, T1t);
Chris@82 261 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1r, T1v, T1w);
Chris@82 262 }
/* Row 3, combination 3, scaled by (W[4], W[5]). */
Chris@82 263 {
Chris@82 264 E T1X, T1S, T1Y, T1L, T1R;
Chris@82 265 T1X = T1T - T1W;
Chris@82 266 T1S = W[5];
Chris@82 267 T1Y = T1S * T1Q;
Chris@82 268 T1L = W[4];
Chris@82 269 T1R = T1L * T1Q;
Chris@82 270 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1S, T1X, T1R);
Chris@82 271 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1L, T1X, T1Y);
Chris@82 272 }
Chris@82 273 }
Chris@82 274 }
Chris@82 275 }
Chris@82 276
/* Twiddle instruction table referenced by desc below.
 * NOTE(review): the meaning of the {TW_FULL, 0, 4} and {TW_NEXT, 1, 0}
 * entries is defined by tw_instr, declared outside this file; presumably
 * they request the complete factor set for size 4 (q1_4 consumes six values
 * of W per iteration) -- confirm against the tw_instr definition. */
Chris@82 277 static const tw_instr twinstr[] = {
Chris@82 278 {TW_FULL, 0, 4},
Chris@82 279 {TW_NEXT, 1, 0}
Chris@82 280 };
Chris@82 281
/* Descriptor handed to the planner together with q1_4.  The leading 4 and
 * the name string match the "-n 4 -name q1_4" generator options, and
 * {64, 24, 24, 0} repeats the counts quoted in the generated header comment
 * (64 additions, 24 multiplications, 24 fused multiply/adds).
 * NOTE(review): the field meanings, including &GENUS and the three trailing
 * zeros, come from ct_desc, which is declared outside this file -- confirm. */
Chris@82 282 static const ct_desc desc = { 4, "q1_4", twinstr, &GENUS, {64, 24, 24, 0}, 0, 0, 0 };
Chris@82 283
/* Registration entry point: passes the q1_4 kernel and its descriptor to
 * X(kdft_difsq_register) for planner p.  X() is a macro defined outside
 * this file. */
Chris@82 284 void X(codelet_q1_4) (planner *p) {
Chris@82 285 X(kdft_difsq_register) (p, q1_4, &desc);
Chris@82 286 }
Chris@82 287 #else
Chris@82 288
Chris@82 289 /* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include dft/scalar/q.h */
Chris@82 290
Chris@82 291 /*
Chris@82 292 * This function contains 88 FP additions, 48 FP multiplications,
Chris@82 293 * (or, 64 additions, 24 multiplications, 24 fused multiply/add),
Chris@82 294 * 37 stack variables, 0 constants, and 64 memory accesses
Chris@82 295 */
Chris@82 296 #include "dft/scalar/q.h"
Chris@82 297
/*
 * q1_4, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) the function reads a 4x4 set of complex values
 * (real parts through rio, imaginary parts through iio) addressed as
 * [WS(vs, v) + WS(rs, r)] for v, r in 0..3.  For every row v it forms the
 * four sum/difference combinations of the entries r = 0..3 (a radix-4
 * butterfly), scales combinations 1..3 by the factor pairs (W[0], W[1]),
 * (W[2], W[3]) and (W[4], W[5]), and writes the results back into the same
 * arrays with the two indices exchanged: combination k of row v is stored
 * at [WS(vs, k) + WS(rs, v)].
 *
 * Scaling of a value (re, im) by a pair (wr, wi) stores
 *     real = wr * re + wi * im,    imag = wr * im - wi * re,
 * assuming FMA(a, b, c) is a*b + c and FNMS(a, b, c) is c - a*b; both macros
 * are defined outside this file -- TODO confirm.
 *
 * Parameters:
 *   rio, iio  real / imaginary data, read and overwritten in place
 *   W         factor table; six values are used per iteration, starting at
 *             W + mb * 6
 *   rs, vs    strides applied through WS() to the two indices of the set
 *   mb, me    half-open range of the loop counter m
 *   ms        amount added to rio and iio after every iteration
 *
 * All 32 input values are loaded into locals before the first store, which
 * is what makes the index-exchanging in-place update safe; do not reorder
 * the load and store phases.
 */
Chris@82 298 static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 299 {
Chris@82 300 {
Chris@82 301 INT m;
/* W is first moved to the factors for iteration mb and then advances by 6
 * per iteration.  NOTE(review): MAKE_VOLATILE_STRIDE is defined elsewhere;
 * presumably it has no arithmetic effect here -- confirm. */
Chris@82 302 for (m = mb, W = W + (mb * 6); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
/* Pairwise sums and differences kept live from the load phase to the stores. */
Chris@82 303 E T3, Te, Tb, Tq, T6, T8, Th, Tr, Tv, TG, TD, TS, Ty, TA, TJ;
Chris@82 304 E TT, TX, T18, T15, T1k, T10, T12, T1b, T1l, T1p, T1A, T1x, T1M, T1s, T1u;
Chris@82 305 E T1D, T1N;
/* Row 0, entries r = 0 and r = 2. */
Chris@82 306 {
Chris@82 307 E T1, T2, T9, Ta;
Chris@82 308 T1 = rio[0];
Chris@82 309 T2 = rio[WS(rs, 2)];
Chris@82 310 T3 = T1 + T2;
Chris@82 311 Te = T1 - T2;
Chris@82 312 T9 = iio[0];
Chris@82 313 Ta = iio[WS(rs, 2)];
Chris@82 314 Tb = T9 - Ta;
Chris@82 315 Tq = T9 + Ta;
Chris@82 316 }
/* Row 0, entries r = 1 and r = 3. */
Chris@82 317 {
Chris@82 318 E T4, T5, Tf, Tg;
Chris@82 319 T4 = rio[WS(rs, 1)];
Chris@82 320 T5 = rio[WS(rs, 3)];
Chris@82 321 T6 = T4 + T5;
Chris@82 322 T8 = T4 - T5;
Chris@82 323 Tf = iio[WS(rs, 1)];
Chris@82 324 Tg = iio[WS(rs, 3)];
Chris@82 325 Th = Tf - Tg;
Chris@82 326 Tr = Tf + Tg;
Chris@82 327 }
/* Row 1, entries r = 0 and r = 2. */
Chris@82 328 {
Chris@82 329 E Tt, Tu, TB, TC;
Chris@82 330 Tt = rio[WS(vs, 1)];
Chris@82 331 Tu = rio[WS(vs, 1) + WS(rs, 2)];
Chris@82 332 Tv = Tt + Tu;
Chris@82 333 TG = Tt - Tu;
Chris@82 334 TB = iio[WS(vs, 1)];
Chris@82 335 TC = iio[WS(vs, 1) + WS(rs, 2)];
Chris@82 336 TD = TB - TC;
Chris@82 337 TS = TB + TC;
Chris@82 338 }
/* Row 1, entries r = 1 and r = 3. */
Chris@82 339 {
Chris@82 340 E Tw, Tx, TH, TI;
Chris@82 341 Tw = rio[WS(vs, 1) + WS(rs, 1)];
Chris@82 342 Tx = rio[WS(vs, 1) + WS(rs, 3)];
Chris@82 343 Ty = Tw + Tx;
Chris@82 344 TA = Tw - Tx;
Chris@82 345 TH = iio[WS(vs, 1) + WS(rs, 1)];
Chris@82 346 TI = iio[WS(vs, 1) + WS(rs, 3)];
Chris@82 347 TJ = TH - TI;
Chris@82 348 TT = TH + TI;
Chris@82 349 }
/* Row 2, entries r = 0 and r = 2. */
Chris@82 350 {
Chris@82 351 E TV, TW, T13, T14;
Chris@82 352 TV = rio[WS(vs, 2)];
Chris@82 353 TW = rio[WS(vs, 2) + WS(rs, 2)];
Chris@82 354 TX = TV + TW;
Chris@82 355 T18 = TV - TW;
Chris@82 356 T13 = iio[WS(vs, 2)];
Chris@82 357 T14 = iio[WS(vs, 2) + WS(rs, 2)];
Chris@82 358 T15 = T13 - T14;
Chris@82 359 T1k = T13 + T14;
Chris@82 360 }
/* Row 2, entries r = 1 and r = 3. */
Chris@82 361 {
Chris@82 362 E TY, TZ, T19, T1a;
Chris@82 363 TY = rio[WS(vs, 2) + WS(rs, 1)];
Chris@82 364 TZ = rio[WS(vs, 2) + WS(rs, 3)];
Chris@82 365 T10 = TY + TZ;
Chris@82 366 T12 = TY - TZ;
Chris@82 367 T19 = iio[WS(vs, 2) + WS(rs, 1)];
Chris@82 368 T1a = iio[WS(vs, 2) + WS(rs, 3)];
Chris@82 369 T1b = T19 - T1a;
Chris@82 370 T1l = T19 + T1a;
Chris@82 371 }
/* Row 3, entries r = 0 and r = 2. */
Chris@82 372 {
Chris@82 373 E T1n, T1o, T1v, T1w;
Chris@82 374 T1n = rio[WS(vs, 3)];
Chris@82 375 T1o = rio[WS(vs, 3) + WS(rs, 2)];
Chris@82 376 T1p = T1n + T1o;
Chris@82 377 T1A = T1n - T1o;
Chris@82 378 T1v = iio[WS(vs, 3)];
Chris@82 379 T1w = iio[WS(vs, 3) + WS(rs, 2)];
Chris@82 380 T1x = T1v - T1w;
Chris@82 381 T1M = T1v + T1w;
Chris@82 382 }
/* Row 3, entries r = 1 and r = 3. */
Chris@82 383 {
Chris@82 384 E T1q, T1r, T1B, T1C;
Chris@82 385 T1q = rio[WS(vs, 3) + WS(rs, 1)];
Chris@82 386 T1r = rio[WS(vs, 3) + WS(rs, 3)];
Chris@82 387 T1s = T1q + T1r;
Chris@82 388 T1u = T1q - T1r;
Chris@82 389 T1B = iio[WS(vs, 3) + WS(rs, 1)];
Chris@82 390 T1C = iio[WS(vs, 3) + WS(rs, 3)];
Chris@82 391 T1D = T1B - T1C;
Chris@82 392 T1N = T1B + T1C;
Chris@82 393 }
/* Store phase.  Combination 0 (the plain sum) of row v is written unscaled
 * to [WS(rs, v)]. */
Chris@82 394 rio[0] = T3 + T6;
Chris@82 395 iio[0] = Tq + Tr;
Chris@82 396 rio[WS(rs, 1)] = Tv + Ty;
Chris@82 397 iio[WS(rs, 1)] = TS + TT;
Chris@82 398 rio[WS(rs, 2)] = TX + T10;
Chris@82 399 iio[WS(rs, 2)] = T1k + T1l;
Chris@82 400 iio[WS(rs, 3)] = T1M + T1N;
Chris@82 401 rio[WS(rs, 3)] = T1p + T1s;
/* Row 0, combination 3, scaled by (W[4], W[5]). */
Chris@82 402 {
Chris@82 403 E Tc, Ti, T7, Td;
Chris@82 404 Tc = T8 + Tb;
Chris@82 405 Ti = Te - Th;
Chris@82 406 T7 = W[4];
Chris@82 407 Td = W[5];
Chris@82 408 iio[WS(vs, 3)] = FNMS(Td, Ti, T7 * Tc);
Chris@82 409 rio[WS(vs, 3)] = FMA(Td, Tc, T7 * Ti);
Chris@82 410 }
/* Row 3, combination 2, scaled by (W[2], W[3]). */
Chris@82 411 {
Chris@82 412 E T1K, T1O, T1J, T1L;
Chris@82 413 T1K = T1p - T1s;
Chris@82 414 T1O = T1M - T1N;
Chris@82 415 T1J = W[2];
Chris@82 416 T1L = W[3];
Chris@82 417 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T1J, T1K, T1L * T1O);
Chris@82 418 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T1L, T1K, T1J * T1O);
Chris@82 419 }
/* Row 0, combination 1, scaled by (W[0], W[1]). */
Chris@82 420 {
Chris@82 421 E Tk, Tm, Tj, Tl;
Chris@82 422 Tk = Tb - T8;
Chris@82 423 Tm = Te + Th;
Chris@82 424 Tj = W[0];
Chris@82 425 Tl = W[1];
Chris@82 426 iio[WS(vs, 1)] = FNMS(Tl, Tm, Tj * Tk);
Chris@82 427 rio[WS(vs, 1)] = FMA(Tl, Tk, Tj * Tm);
Chris@82 428 }
/* Row 0, combination 2, scaled by (W[2], W[3]). */
Chris@82 429 {
Chris@82 430 E To, Ts, Tn, Tp;
Chris@82 431 To = T3 - T6;
Chris@82 432 Ts = Tq - Tr;
Chris@82 433 Tn = W[2];
Chris@82 434 Tp = W[3];
Chris@82 435 rio[WS(vs, 2)] = FMA(Tn, To, Tp * Ts);
Chris@82 436 iio[WS(vs, 2)] = FNMS(Tp, To, Tn * Ts);
Chris@82 437 }
/* Row 2, combination 3, scaled by (W[4], W[5]). */
Chris@82 438 {
Chris@82 439 E T16, T1c, T11, T17;
Chris@82 440 T16 = T12 + T15;
Chris@82 441 T1c = T18 - T1b;
Chris@82 442 T11 = W[4];
Chris@82 443 T17 = W[5];
Chris@82 444 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T17, T1c, T11 * T16);
Chris@82 445 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T17, T16, T11 * T1c);
Chris@82 446 }
/* Row 3, combination 1, scaled by (W[0], W[1]). */
Chris@82 447 {
Chris@82 448 E T1G, T1I, T1F, T1H;
Chris@82 449 T1G = T1x - T1u;
Chris@82 450 T1I = T1A + T1D;
Chris@82 451 T1F = W[0];
Chris@82 452 T1H = W[1];
Chris@82 453 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T1H, T1I, T1F * T1G);
Chris@82 454 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1H, T1G, T1F * T1I);
Chris@82 455 }
/* Row 1, combination 2, scaled by (W[2], W[3]). */
Chris@82 456 {
Chris@82 457 E TQ, TU, TP, TR;
Chris@82 458 TQ = Tv - Ty;
Chris@82 459 TU = TS - TT;
Chris@82 460 TP = W[2];
Chris@82 461 TR = W[3];
Chris@82 462 rio[WS(vs, 2) + WS(rs, 1)] = FMA(TP, TQ, TR * TU);
Chris@82 463 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TR, TQ, TP * TU);
Chris@82 464 }
/* Row 2, combination 1, scaled by (W[0], W[1]). */
Chris@82 465 {
Chris@82 466 E T1e, T1g, T1d, T1f;
Chris@82 467 T1e = T15 - T12;
Chris@82 468 T1g = T18 + T1b;
Chris@82 469 T1d = W[0];
Chris@82 470 T1f = W[1];
Chris@82 471 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
Chris@82 472 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
Chris@82 473 }
/* Row 2, combination 2, scaled by (W[2], W[3]). */
Chris@82 474 {
Chris@82 475 E T1i, T1m, T1h, T1j;
Chris@82 476 T1i = TX - T10;
Chris@82 477 T1m = T1k - T1l;
Chris@82 478 T1h = W[2];
Chris@82 479 T1j = W[3];
Chris@82 480 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1h, T1i, T1j * T1m);
Chris@82 481 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1j, T1i, T1h * T1m);
Chris@82 482 }
/* Row 3, combination 3, scaled by (W[4], W[5]). */
Chris@82 483 {
Chris@82 484 E T1y, T1E, T1t, T1z;
Chris@82 485 T1y = T1u + T1x;
Chris@82 486 T1E = T1A - T1D;
Chris@82 487 T1t = W[4];
Chris@82 488 T1z = W[5];
Chris@82 489 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1z, T1E, T1t * T1y);
Chris@82 490 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1z, T1y, T1t * T1E);
Chris@82 491 }
/* Row 1, combination 1, scaled by (W[0], W[1]). */
Chris@82 492 {
Chris@82 493 E TM, TO, TL, TN;
Chris@82 494 TM = TD - TA;
Chris@82 495 TO = TG + TJ;
Chris@82 496 TL = W[0];
Chris@82 497 TN = W[1];
Chris@82 498 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TN, TO, TL * TM);
Chris@82 499 rio[WS(vs, 1) + WS(rs, 1)] = FMA(TN, TM, TL * TO);
Chris@82 500 }
/* Row 1, combination 3, scaled by (W[4], W[5]). */
Chris@82 501 {
Chris@82 502 E TE, TK, Tz, TF;
Chris@82 503 TE = TA + TD;
Chris@82 504 TK = TG - TJ;
Chris@82 505 Tz = W[4];
Chris@82 506 TF = W[5];
Chris@82 507 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TF, TK, Tz * TE);
Chris@82 508 rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TE, Tz * TK);
Chris@82 509 }
Chris@82 510 }
Chris@82 511 }
Chris@82 512 }
Chris@82 513
/* Twiddle instruction table referenced by desc below.
 * NOTE(review): the meaning of the {TW_FULL, 0, 4} and {TW_NEXT, 1, 0}
 * entries is defined by tw_instr, declared outside this file; presumably
 * they request the complete factor set for size 4 (q1_4 consumes six values
 * of W per iteration) -- confirm against the tw_instr definition. */
Chris@82 514 static const tw_instr twinstr[] = {
Chris@82 515 {TW_FULL, 0, 4},
Chris@82 516 {TW_NEXT, 1, 0}
Chris@82 517 };
Chris@82 518
/* Descriptor handed to the planner together with q1_4.  The leading 4 and
 * the name string match the "-n 4 -name q1_4" generator options, and
 * {64, 24, 24, 0} repeats the counts quoted in the generated header comment
 * (64 additions, 24 multiplications, 24 fused multiply/adds).
 * NOTE(review): the field meanings, including &GENUS and the three trailing
 * zeros, come from ct_desc, which is declared outside this file -- confirm. */
Chris@82 519 static const ct_desc desc = { 4, "q1_4", twinstr, &GENUS, {64, 24, 24, 0}, 0, 0, 0 };
Chris@82 520
/* Registration entry point: passes the q1_4 kernel and its descriptor to
 * X(kdft_difsq_register) for planner p.  X() is a macro defined outside
 * this file. */
Chris@82 521 void X(codelet_q1_4) (planner *p) {
Chris@82 522 X(kdft_difsq_register) (p, q1_4, &desc);
Chris@82 523 }
Chris@82 524 #endif