annotate src/fftw-3.3.8/dft/scalar/codelets/q1_5.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:30 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include dft/scalar/q.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 200 FP additions, 170 FP multiplications,
Chris@82 32 * (or, 70 additions, 40 multiplications, 130 fused multiply/add),
Chris@82 33 * 75 stack variables, 4 constants, and 100 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/q.h"
Chris@82 36
/*
 * q1_5 (variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): generated size-5 codelet (gen_twidsq, -dif -n 5, per the
 * generator comment above).
 *
 * rio, iio : two arrays that are read and then overwritten in place; both
 *            are indexed as x[WS(vs, v) + WS(rs, r)] for v, r in 0..4.
 * W        : twiddle values; W[0]..W[7] are read in every iteration and W
 *            advances by 8 per iteration (starting at W + mb * 8).
 * rs, vs   : the two strides used to address the 5 x 5 block of elements.
 * mb, me   : the loop runs for m = mb .. me - 1.
 * ms       : added to rio and iio after every iteration.
 *
 * Within one iteration all 25 (rio, iio) pairs are loaded before the first
 * store. For the group of five elements read at WS(vs, v), the plain sum
 * is stored at index WS(rs, v), and the four remaining results are stored
 * at WS(vs, k) + WS(rs, v) for k = 1..4 (the WS(rs, 0) term is simply
 * omitted when v = 0), i.e. results land transposed relative to the loads.
 * Result k is combined with the pair W[2k-2], W[2k-1] before being stored.
 *
 * NOTE(review): FMA, FNMS, DK, E, R, INT, WS and MAKE_VOLATILE_STRIDE come
 * from the included headers and are not visible here; the comments below
 * assume FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 * The constants are numerically sin(2*pi/5) (KP951056516), sqrt(5)/4
 * (KP559016994), 1/4 and (sqrt(5) - 1)/2 (KP618033988).
 */
Chris@82 37 static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 43 {
Chris@82 44 INT m;
Chris@82 45 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
/* Values that must survive from the load phase into the store phase. */
Chris@82 46 E T1, Tb, TM, Tw, T8, Ta, Tn, Tj, TH, Ts, Tq, Tr, TV, T15, T1G;
Chris@82 47 E T1q, T12, T14, T1h, T1d, T1B, T1m, T1k, T1l, T1P, T1Z, T2A, T2k, T1W, T1Y;
Chris@82 48 E T2b, T27, T2v, T2g, T2e, T2f, T3Z, T3V, T4j, T44, T42, T43, T3D, T3N, T4o;
Chris@82 49 E T48, T3K, T3M, T2J, T2T, T3u, T3e, T2Q, T2S, T35, T31, T3p, T3a, T38, T39;
/* Load phase, group vs = 0: rio values, then iio values. Elements 1/4 and
   2/3 are paired into sums and differences; T8 is the sum of elements 1..4. */
Chris@82 50 {
Chris@82 51 E T7, Tv, T4, Tu;
Chris@82 52 T1 = rio[0];
Chris@82 53 {
Chris@82 54 E T5, T6, T2, T3;
Chris@82 55 T5 = rio[WS(rs, 2)];
Chris@82 56 T6 = rio[WS(rs, 3)];
Chris@82 57 T7 = T5 + T6;
Chris@82 58 Tv = T5 - T6;
Chris@82 59 T2 = rio[WS(rs, 1)];
Chris@82 60 T3 = rio[WS(rs, 4)];
Chris@82 61 T4 = T2 + T3;
Chris@82 62 Tu = T2 - T3;
Chris@82 63 }
Chris@82 64 Tb = T4 - T7;
Chris@82 65 TM = FNMS(KP618033988, Tu, Tv);
Chris@82 66 Tw = FMA(KP618033988, Tv, Tu);
Chris@82 67 T8 = T4 + T7;
Chris@82 68 Ta = FNMS(KP250000000, T8, T1);
Chris@82 69 }
Chris@82 70 {
Chris@82 71 E Ti, Tp, Tf, To;
Chris@82 72 Tn = iio[0];
Chris@82 73 {
Chris@82 74 E Tg, Th, Td, Te;
Chris@82 75 Tg = iio[WS(rs, 2)];
Chris@82 76 Th = iio[WS(rs, 3)];
Chris@82 77 Ti = Tg - Th;
Chris@82 78 Tp = Tg + Th;
Chris@82 79 Td = iio[WS(rs, 1)];
Chris@82 80 Te = iio[WS(rs, 4)];
Chris@82 81 Tf = Td - Te;
Chris@82 82 To = Td + Te;
Chris@82 83 }
Chris@82 84 Tj = FMA(KP618033988, Ti, Tf);
Chris@82 85 TH = FNMS(KP618033988, Tf, Ti);
Chris@82 86 Ts = To - Tp;
Chris@82 87 Tq = To + Tp;
Chris@82 88 Tr = FNMS(KP250000000, Tq, Tn);
Chris@82 89 }
/* Load phase, group vs = 1 (same pattern as group 0). */
Chris@82 90 {
Chris@82 91 E T11, T1p, TY, T1o;
Chris@82 92 TV = rio[WS(vs, 1)];
Chris@82 93 {
Chris@82 94 E TZ, T10, TW, TX;
Chris@82 95 TZ = rio[WS(vs, 1) + WS(rs, 2)];
Chris@82 96 T10 = rio[WS(vs, 1) + WS(rs, 3)];
Chris@82 97 T11 = TZ + T10;
Chris@82 98 T1p = TZ - T10;
Chris@82 99 TW = rio[WS(vs, 1) + WS(rs, 1)];
Chris@82 100 TX = rio[WS(vs, 1) + WS(rs, 4)];
Chris@82 101 TY = TW + TX;
Chris@82 102 T1o = TW - TX;
Chris@82 103 }
Chris@82 104 T15 = TY - T11;
Chris@82 105 T1G = FNMS(KP618033988, T1o, T1p);
Chris@82 106 T1q = FMA(KP618033988, T1p, T1o);
Chris@82 107 T12 = TY + T11;
Chris@82 108 T14 = FNMS(KP250000000, T12, TV);
Chris@82 109 }
Chris@82 110 {
Chris@82 111 E T1c, T1j, T19, T1i;
Chris@82 112 T1h = iio[WS(vs, 1)];
Chris@82 113 {
Chris@82 114 E T1a, T1b, T17, T18;
Chris@82 115 T1a = iio[WS(vs, 1) + WS(rs, 2)];
Chris@82 116 T1b = iio[WS(vs, 1) + WS(rs, 3)];
Chris@82 117 T1c = T1a - T1b;
Chris@82 118 T1j = T1a + T1b;
Chris@82 119 T17 = iio[WS(vs, 1) + WS(rs, 1)];
Chris@82 120 T18 = iio[WS(vs, 1) + WS(rs, 4)];
Chris@82 121 T19 = T17 - T18;
Chris@82 122 T1i = T17 + T18;
Chris@82 123 }
Chris@82 124 T1d = FMA(KP618033988, T1c, T19);
Chris@82 125 T1B = FNMS(KP618033988, T19, T1c);
Chris@82 126 T1m = T1i - T1j;
Chris@82 127 T1k = T1i + T1j;
Chris@82 128 T1l = FNMS(KP250000000, T1k, T1h);
Chris@82 129 }
/* Load phase, group vs = 2. */
Chris@82 130 {
Chris@82 131 E T1V, T2j, T1S, T2i;
Chris@82 132 T1P = rio[WS(vs, 2)];
Chris@82 133 {
Chris@82 134 E T1T, T1U, T1Q, T1R;
Chris@82 135 T1T = rio[WS(vs, 2) + WS(rs, 2)];
Chris@82 136 T1U = rio[WS(vs, 2) + WS(rs, 3)];
Chris@82 137 T1V = T1T + T1U;
Chris@82 138 T2j = T1T - T1U;
Chris@82 139 T1Q = rio[WS(vs, 2) + WS(rs, 1)];
Chris@82 140 T1R = rio[WS(vs, 2) + WS(rs, 4)];
Chris@82 141 T1S = T1Q + T1R;
Chris@82 142 T2i = T1Q - T1R;
Chris@82 143 }
Chris@82 144 T1Z = T1S - T1V;
Chris@82 145 T2A = FNMS(KP618033988, T2i, T2j);
Chris@82 146 T2k = FMA(KP618033988, T2j, T2i);
Chris@82 147 T1W = T1S + T1V;
Chris@82 148 T1Y = FNMS(KP250000000, T1W, T1P);
Chris@82 149 }
Chris@82 150 {
Chris@82 151 E T26, T2d, T23, T2c;
Chris@82 152 T2b = iio[WS(vs, 2)];
Chris@82 153 {
Chris@82 154 E T24, T25, T21, T22;
Chris@82 155 T24 = iio[WS(vs, 2) + WS(rs, 2)];
Chris@82 156 T25 = iio[WS(vs, 2) + WS(rs, 3)];
Chris@82 157 T26 = T24 - T25;
Chris@82 158 T2d = T24 + T25;
Chris@82 159 T21 = iio[WS(vs, 2) + WS(rs, 1)];
Chris@82 160 T22 = iio[WS(vs, 2) + WS(rs, 4)];
Chris@82 161 T23 = T21 - T22;
Chris@82 162 T2c = T21 + T22;
Chris@82 163 }
Chris@82 164 T27 = FMA(KP618033988, T26, T23);
Chris@82 165 T2v = FNMS(KP618033988, T23, T26);
Chris@82 166 T2g = T2c - T2d;
Chris@82 167 T2e = T2c + T2d;
Chris@82 168 T2f = FNMS(KP250000000, T2e, T2b);
Chris@82 169 }
/* Load phase, group vs = 4 (iio first, then rio). */
Chris@82 170 {
Chris@82 171 E T3U, T41, T3R, T40;
Chris@82 172 T3Z = iio[WS(vs, 4)];
Chris@82 173 {
Chris@82 174 E T3S, T3T, T3P, T3Q;
Chris@82 175 T3S = iio[WS(vs, 4) + WS(rs, 2)];
Chris@82 176 T3T = iio[WS(vs, 4) + WS(rs, 3)];
Chris@82 177 T3U = T3S - T3T;
Chris@82 178 T41 = T3S + T3T;
Chris@82 179 T3P = iio[WS(vs, 4) + WS(rs, 1)];
Chris@82 180 T3Q = iio[WS(vs, 4) + WS(rs, 4)];
Chris@82 181 T3R = T3P - T3Q;
Chris@82 182 T40 = T3P + T3Q;
Chris@82 183 }
Chris@82 184 T3V = FMA(KP618033988, T3U, T3R);
Chris@82 185 T4j = FNMS(KP618033988, T3R, T3U);
Chris@82 186 T44 = T40 - T41;
Chris@82 187 T42 = T40 + T41;
Chris@82 188 T43 = FNMS(KP250000000, T42, T3Z);
Chris@82 189 }
Chris@82 190 {
Chris@82 191 E T3J, T47, T3G, T46;
Chris@82 192 T3D = rio[WS(vs, 4)];
Chris@82 193 {
Chris@82 194 E T3H, T3I, T3E, T3F;
Chris@82 195 T3H = rio[WS(vs, 4) + WS(rs, 2)];
Chris@82 196 T3I = rio[WS(vs, 4) + WS(rs, 3)];
Chris@82 197 T3J = T3H + T3I;
Chris@82 198 T47 = T3H - T3I;
Chris@82 199 T3E = rio[WS(vs, 4) + WS(rs, 1)];
Chris@82 200 T3F = rio[WS(vs, 4) + WS(rs, 4)];
Chris@82 201 T3G = T3E + T3F;
Chris@82 202 T46 = T3E - T3F;
Chris@82 203 }
Chris@82 204 T3N = T3G - T3J;
Chris@82 205 T4o = FNMS(KP618033988, T46, T47);
Chris@82 206 T48 = FMA(KP618033988, T47, T46);
Chris@82 207 T3K = T3G + T3J;
Chris@82 208 T3M = FNMS(KP250000000, T3K, T3D);
Chris@82 209 }
/* Load phase, group vs = 3. */
Chris@82 210 {
Chris@82 211 E T2P, T3d, T2M, T3c;
Chris@82 212 T2J = rio[WS(vs, 3)];
Chris@82 213 {
Chris@82 214 E T2N, T2O, T2K, T2L;
Chris@82 215 T2N = rio[WS(vs, 3) + WS(rs, 2)];
Chris@82 216 T2O = rio[WS(vs, 3) + WS(rs, 3)];
Chris@82 217 T2P = T2N + T2O;
Chris@82 218 T3d = T2N - T2O;
Chris@82 219 T2K = rio[WS(vs, 3) + WS(rs, 1)];
Chris@82 220 T2L = rio[WS(vs, 3) + WS(rs, 4)];
Chris@82 221 T2M = T2K + T2L;
Chris@82 222 T3c = T2K - T2L;
Chris@82 223 }
Chris@82 224 T2T = T2M - T2P;
Chris@82 225 T3u = FNMS(KP618033988, T3c, T3d);
Chris@82 226 T3e = FMA(KP618033988, T3d, T3c);
Chris@82 227 T2Q = T2M + T2P;
Chris@82 228 T2S = FNMS(KP250000000, T2Q, T2J);
Chris@82 229 }
Chris@82 230 {
Chris@82 231 E T30, T37, T2X, T36;
Chris@82 232 T35 = iio[WS(vs, 3)];
Chris@82 233 {
Chris@82 234 E T2Y, T2Z, T2V, T2W;
Chris@82 235 T2Y = iio[WS(vs, 3) + WS(rs, 2)];
Chris@82 236 T2Z = iio[WS(vs, 3) + WS(rs, 3)];
Chris@82 237 T30 = T2Y - T2Z;
Chris@82 238 T37 = T2Y + T2Z;
Chris@82 239 T2V = iio[WS(vs, 3) + WS(rs, 1)];
Chris@82 240 T2W = iio[WS(vs, 3) + WS(rs, 4)];
Chris@82 241 T2X = T2V - T2W;
Chris@82 242 T36 = T2V + T2W;
Chris@82 243 }
Chris@82 244 T31 = FMA(KP618033988, T30, T2X);
Chris@82 245 T3p = FNMS(KP618033988, T2X, T30);
Chris@82 246 T3a = T36 - T37;
Chris@82 247 T38 = T36 + T37;
Chris@82 248 T39 = FNMS(KP250000000, T38, T35);
Chris@82 249 }
/* Store phase begins here; every input has already been read. First the
   plain sums: the sum of the group read at WS(vs, v) goes to WS(rs, v). */
Chris@82 250 rio[0] = T1 + T8;
Chris@82 251 iio[0] = Tn + Tq;
Chris@82 252 rio[WS(rs, 1)] = TV + T12;
Chris@82 253 iio[WS(rs, 1)] = T1h + T1k;
Chris@82 254 rio[WS(rs, 2)] = T1P + T1W;
Chris@82 255 iio[WS(rs, 2)] = T2b + T2e;
Chris@82 256 iio[WS(rs, 4)] = T3Z + T42;
Chris@82 257 rio[WS(rs, 4)] = T3D + T3K;
Chris@82 258 rio[WS(rs, 3)] = T2J + T2Q;
Chris@82 259 iio[WS(rs, 3)] = T35 + T38;
/* Group 0, results 1 and 4 (twiddle pairs W[0],W[1] and W[6],W[7]).
   Under the assumed FMA/FNMS definitions each store pair computes
   rio = Wa * re + Wb * im and iio = Wa * im - Wb * re. */
Chris@82 260 {
Chris@82 261 E Tk, TA, Tx, TD, Tc, Tt;
Chris@82 262 Tc = FMA(KP559016994, Tb, Ta);
Chris@82 263 Tk = FMA(KP951056516, Tj, Tc);
Chris@82 264 TA = FNMS(KP951056516, Tj, Tc);
Chris@82 265 Tt = FMA(KP559016994, Ts, Tr);
Chris@82 266 Tx = FNMS(KP951056516, Tw, Tt);
Chris@82 267 TD = FMA(KP951056516, Tw, Tt);
Chris@82 268 {
Chris@82 269 E Tl, Ty, T9, Tm;
Chris@82 270 T9 = W[0];
Chris@82 271 Tl = T9 * Tk;
Chris@82 272 Ty = T9 * Tx;
Chris@82 273 Tm = W[1];
Chris@82 274 rio[WS(vs, 1)] = FMA(Tm, Tx, Tl);
Chris@82 275 iio[WS(vs, 1)] = FNMS(Tm, Tk, Ty);
Chris@82 276 }
Chris@82 277 {
Chris@82 278 E TB, TE, Tz, TC;
Chris@82 279 Tz = W[6];
Chris@82 280 TB = Tz * TA;
Chris@82 281 TE = Tz * TD;
Chris@82 282 TC = W[7];
Chris@82 283 rio[WS(vs, 4)] = FMA(TC, TD, TB);
Chris@82 284 iio[WS(vs, 4)] = FNMS(TC, TA, TE);
Chris@82 285 }
Chris@82 286 }
/* Group 0, results 2 and 3 (twiddle pairs W[2],W[3] and W[4],W[5]). */
Chris@82 287 {
Chris@82 288 E TI, TQ, TN, TT, TG, TL;
Chris@82 289 TG = FNMS(KP559016994, Tb, Ta);
Chris@82 290 TI = FNMS(KP951056516, TH, TG);
Chris@82 291 TQ = FMA(KP951056516, TH, TG);
Chris@82 292 TL = FNMS(KP559016994, Ts, Tr);
Chris@82 293 TN = FMA(KP951056516, TM, TL);
Chris@82 294 TT = FNMS(KP951056516, TM, TL);
Chris@82 295 {
Chris@82 296 E TJ, TO, TF, TK;
Chris@82 297 TF = W[2];
Chris@82 298 TJ = TF * TI;
Chris@82 299 TO = TF * TN;
Chris@82 300 TK = W[3];
Chris@82 301 rio[WS(vs, 2)] = FMA(TK, TN, TJ);
Chris@82 302 iio[WS(vs, 2)] = FNMS(TK, TI, TO);
Chris@82 303 }
Chris@82 304 {
Chris@82 305 E TR, TU, TP, TS;
Chris@82 306 TP = W[4];
Chris@82 307 TR = TP * TQ;
Chris@82 308 TU = TP * TT;
Chris@82 309 TS = W[5];
Chris@82 310 rio[WS(vs, 3)] = FMA(TS, TT, TR);
Chris@82 311 iio[WS(vs, 3)] = FNMS(TS, TQ, TU);
Chris@82 312 }
Chris@82 313 }
/* Group 2, results 2 and 3. */
Chris@82 314 {
Chris@82 315 E T2w, T2E, T2B, T2H, T2u, T2z;
Chris@82 316 T2u = FNMS(KP559016994, T1Z, T1Y);
Chris@82 317 T2w = FNMS(KP951056516, T2v, T2u);
Chris@82 318 T2E = FMA(KP951056516, T2v, T2u);
Chris@82 319 T2z = FNMS(KP559016994, T2g, T2f);
Chris@82 320 T2B = FMA(KP951056516, T2A, T2z);
Chris@82 321 T2H = FNMS(KP951056516, T2A, T2z);
Chris@82 322 {
Chris@82 323 E T2x, T2C, T2t, T2y;
Chris@82 324 T2t = W[2];
Chris@82 325 T2x = T2t * T2w;
Chris@82 326 T2C = T2t * T2B;
Chris@82 327 T2y = W[3];
Chris@82 328 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T2y, T2B, T2x);
Chris@82 329 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2y, T2w, T2C);
Chris@82 330 }
Chris@82 331 {
Chris@82 332 E T2F, T2I, T2D, T2G;
Chris@82 333 T2D = W[4];
Chris@82 334 T2F = T2D * T2E;
Chris@82 335 T2I = T2D * T2H;
Chris@82 336 T2G = W[5];
Chris@82 337 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2G, T2H, T2F);
Chris@82 338 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2G, T2E, T2I);
Chris@82 339 }
Chris@82 340 }
/* Group 4, results 2 and 3. */
Chris@82 341 {
Chris@82 342 E T4k, T4s, T4p, T4v, T4i, T4n;
Chris@82 343 T4i = FNMS(KP559016994, T3N, T3M);
Chris@82 344 T4k = FNMS(KP951056516, T4j, T4i);
Chris@82 345 T4s = FMA(KP951056516, T4j, T4i);
Chris@82 346 T4n = FNMS(KP559016994, T44, T43);
Chris@82 347 T4p = FMA(KP951056516, T4o, T4n);
Chris@82 348 T4v = FNMS(KP951056516, T4o, T4n);
Chris@82 349 {
Chris@82 350 E T4l, T4q, T4h, T4m;
Chris@82 351 T4h = W[2];
Chris@82 352 T4l = T4h * T4k;
Chris@82 353 T4q = T4h * T4p;
Chris@82 354 T4m = W[3];
Chris@82 355 rio[WS(vs, 2) + WS(rs, 4)] = FMA(T4m, T4p, T4l);
Chris@82 356 iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T4m, T4k, T4q);
Chris@82 357 }
Chris@82 358 {
Chris@82 359 E T4t, T4w, T4r, T4u;
Chris@82 360 T4r = W[4];
Chris@82 361 T4t = T4r * T4s;
Chris@82 362 T4w = T4r * T4v;
Chris@82 363 T4u = W[5];
Chris@82 364 rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4u, T4v, T4t);
Chris@82 365 iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4u, T4s, T4w);
Chris@82 366 }
Chris@82 367 }
/* Group 2, results 1 and 4. */
Chris@82 368 {
Chris@82 369 E T28, T2o, T2l, T2r, T20, T2h;
Chris@82 370 T20 = FMA(KP559016994, T1Z, T1Y);
Chris@82 371 T28 = FMA(KP951056516, T27, T20);
Chris@82 372 T2o = FNMS(KP951056516, T27, T20);
Chris@82 373 T2h = FMA(KP559016994, T2g, T2f);
Chris@82 374 T2l = FNMS(KP951056516, T2k, T2h);
Chris@82 375 T2r = FMA(KP951056516, T2k, T2h);
Chris@82 376 {
Chris@82 377 E T29, T2m, T1X, T2a;
Chris@82 378 T1X = W[0];
Chris@82 379 T29 = T1X * T28;
Chris@82 380 T2m = T1X * T2l;
Chris@82 381 T2a = W[1];
Chris@82 382 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T2a, T2l, T29);
Chris@82 383 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2a, T28, T2m);
Chris@82 384 }
Chris@82 385 {
Chris@82 386 E T2p, T2s, T2n, T2q;
Chris@82 387 T2n = W[6];
Chris@82 388 T2p = T2n * T2o;
Chris@82 389 T2s = T2n * T2r;
Chris@82 390 T2q = W[7];
Chris@82 391 rio[WS(vs, 4) + WS(rs, 2)] = FMA(T2q, T2r, T2p);
Chris@82 392 iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T2q, T2o, T2s);
Chris@82 393 }
Chris@82 394 }
/* Group 3, results 1 and 4. */
Chris@82 395 {
Chris@82 396 E T32, T3i, T3f, T3l, T2U, T3b;
Chris@82 397 T2U = FMA(KP559016994, T2T, T2S);
Chris@82 398 T32 = FMA(KP951056516, T31, T2U);
Chris@82 399 T3i = FNMS(KP951056516, T31, T2U);
Chris@82 400 T3b = FMA(KP559016994, T3a, T39);
Chris@82 401 T3f = FNMS(KP951056516, T3e, T3b);
Chris@82 402 T3l = FMA(KP951056516, T3e, T3b);
Chris@82 403 {
Chris@82 404 E T33, T3g, T2R, T34;
Chris@82 405 T2R = W[0];
Chris@82 406 T33 = T2R * T32;
Chris@82 407 T3g = T2R * T3f;
Chris@82 408 T34 = W[1];
Chris@82 409 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T34, T3f, T33);
Chris@82 410 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T34, T32, T3g);
Chris@82 411 }
Chris@82 412 {
Chris@82 413 E T3j, T3m, T3h, T3k;
Chris@82 414 T3h = W[6];
Chris@82 415 T3j = T3h * T3i;
Chris@82 416 T3m = T3h * T3l;
Chris@82 417 T3k = W[7];
Chris@82 418 rio[WS(vs, 4) + WS(rs, 3)] = FMA(T3k, T3l, T3j);
Chris@82 419 iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T3k, T3i, T3m);
Chris@82 420 }
Chris@82 421 }
/* Group 3, results 2 and 3. */
Chris@82 422 {
Chris@82 423 E T3q, T3y, T3v, T3B, T3o, T3t;
Chris@82 424 T3o = FNMS(KP559016994, T2T, T2S);
Chris@82 425 T3q = FNMS(KP951056516, T3p, T3o);
Chris@82 426 T3y = FMA(KP951056516, T3p, T3o);
Chris@82 427 T3t = FNMS(KP559016994, T3a, T39);
Chris@82 428 T3v = FMA(KP951056516, T3u, T3t);
Chris@82 429 T3B = FNMS(KP951056516, T3u, T3t);
Chris@82 430 {
Chris@82 431 E T3r, T3w, T3n, T3s;
Chris@82 432 T3n = W[2];
Chris@82 433 T3r = T3n * T3q;
Chris@82 434 T3w = T3n * T3v;
Chris@82 435 T3s = W[3];
Chris@82 436 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T3s, T3v, T3r);
Chris@82 437 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T3s, T3q, T3w);
Chris@82 438 }
Chris@82 439 {
Chris@82 440 E T3z, T3C, T3x, T3A;
Chris@82 441 T3x = W[4];
Chris@82 442 T3z = T3x * T3y;
Chris@82 443 T3C = T3x * T3B;
Chris@82 444 T3A = W[5];
Chris@82 445 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3A, T3B, T3z);
Chris@82 446 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3A, T3y, T3C);
Chris@82 447 }
Chris@82 448 }
/* Group 4, results 1 and 4. */
Chris@82 449 {
Chris@82 450 E T3W, T4c, T49, T4f, T3O, T45;
Chris@82 451 T3O = FMA(KP559016994, T3N, T3M);
Chris@82 452 T3W = FMA(KP951056516, T3V, T3O);
Chris@82 453 T4c = FNMS(KP951056516, T3V, T3O);
Chris@82 454 T45 = FMA(KP559016994, T44, T43);
Chris@82 455 T49 = FNMS(KP951056516, T48, T45);
Chris@82 456 T4f = FMA(KP951056516, T48, T45);
Chris@82 457 {
Chris@82 458 E T3X, T4a, T3L, T3Y;
Chris@82 459 T3L = W[0];
Chris@82 460 T3X = T3L * T3W;
Chris@82 461 T4a = T3L * T49;
Chris@82 462 T3Y = W[1];
Chris@82 463 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3Y, T49, T3X);
Chris@82 464 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3Y, T3W, T4a);
Chris@82 465 }
Chris@82 466 {
Chris@82 467 E T4d, T4g, T4b, T4e;
Chris@82 468 T4b = W[6];
Chris@82 469 T4d = T4b * T4c;
Chris@82 470 T4g = T4b * T4f;
Chris@82 471 T4e = W[7];
Chris@82 472 rio[WS(vs, 4) + WS(rs, 4)] = FMA(T4e, T4f, T4d);
Chris@82 473 iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T4e, T4c, T4g);
Chris@82 474 }
Chris@82 475 }
/* Group 1, results 2 and 3. */
Chris@82 476 {
Chris@82 477 E T1C, T1K, T1H, T1N, T1A, T1F;
Chris@82 478 T1A = FNMS(KP559016994, T15, T14);
Chris@82 479 T1C = FNMS(KP951056516, T1B, T1A);
Chris@82 480 T1K = FMA(KP951056516, T1B, T1A);
Chris@82 481 T1F = FNMS(KP559016994, T1m, T1l);
Chris@82 482 T1H = FMA(KP951056516, T1G, T1F);
Chris@82 483 T1N = FNMS(KP951056516, T1G, T1F);
Chris@82 484 {
Chris@82 485 E T1D, T1I, T1z, T1E;
Chris@82 486 T1z = W[2];
Chris@82 487 T1D = T1z * T1C;
Chris@82 488 T1I = T1z * T1H;
Chris@82 489 T1E = W[3];
Chris@82 490 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1E, T1H, T1D);
Chris@82 491 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1E, T1C, T1I);
Chris@82 492 }
Chris@82 493 {
Chris@82 494 E T1L, T1O, T1J, T1M;
Chris@82 495 T1J = W[4];
Chris@82 496 T1L = T1J * T1K;
Chris@82 497 T1O = T1J * T1N;
Chris@82 498 T1M = W[5];
Chris@82 499 rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1M, T1N, T1L);
Chris@82 500 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1M, T1K, T1O);
Chris@82 501 }
Chris@82 502 }
/* Group 1, results 1 and 4. */
Chris@82 503 {
Chris@82 504 E T1e, T1u, T1r, T1x, T16, T1n;
Chris@82 505 T16 = FMA(KP559016994, T15, T14);
Chris@82 506 T1e = FMA(KP951056516, T1d, T16);
Chris@82 507 T1u = FNMS(KP951056516, T1d, T16);
Chris@82 508 T1n = FMA(KP559016994, T1m, T1l);
Chris@82 509 T1r = FNMS(KP951056516, T1q, T1n);
Chris@82 510 T1x = FMA(KP951056516, T1q, T1n);
Chris@82 511 {
Chris@82 512 E T1f, T1s, T13, T1g;
Chris@82 513 T13 = W[0];
Chris@82 514 T1f = T13 * T1e;
Chris@82 515 T1s = T13 * T1r;
Chris@82 516 T1g = W[1];
Chris@82 517 rio[WS(vs, 1) + WS(rs, 1)] = FMA(T1g, T1r, T1f);
Chris@82 518 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1g, T1e, T1s);
Chris@82 519 }
Chris@82 520 {
Chris@82 521 E T1v, T1y, T1t, T1w;
Chris@82 522 T1t = W[6];
Chris@82 523 T1v = T1t * T1u;
Chris@82 524 T1y = T1t * T1x;
Chris@82 525 T1w = W[7];
Chris@82 526 rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1w, T1x, T1v);
Chris@82 527 iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1w, T1u, T1y);
Chris@82 528 }
Chris@82 529 }
Chris@82 530 }
Chris@82 531 }
Chris@82 532 }
Chris@82 533
/*
 * Twiddle instruction table referenced by `desc` below: one TW_FULL entry
 * with arguments (0, 5) followed by one TW_NEXT entry with arguments (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * file; presumably TW_NEXT terminates the list -- confirm.
 */
Chris@82 534 static const tw_instr twinstr[] = {
Chris@82 535 {TW_FULL, 0, 5},
Chris@82 536 {TW_NEXT, 1, 0}
Chris@82 537 };
Chris@82 538
/*
 * Descriptor passed to the planner together with q1_5 (FMA variant). It
 * holds the value 5, the name "q1_5", the twiddle table, &GENUS, and the
 * counts {70, 40, 130, 0}; the first three counts match the "70 additions,
 * 40 multiplications, 130 fused multiply/add" line in the generated header
 * comment of this variant.
 * NOTE(review): the ct_desc field layout is declared elsewhere; the leading
 * 5 is presumably the radix and the trailing zeros are unexplained here.
 */
Chris@82 539 static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, {70, 40, 130, 0}, 0, 0, 0 };
Chris@82 540
/*
 * Registration entry point for this codelet: hands the q1_5 kernel and its
 * descriptor to planner p through X(kdft_difsq_register).
 */
Chris@82 541 void X(codelet_q1_5) (planner *p) {
Chris@82 542 X(kdft_difsq_register) (p, q1_5, &desc);
Chris@82 543 }
Chris@82 544 #else
Chris@82 545
Chris@82 546 /* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include dft/scalar/q.h */
Chris@82 547
Chris@82 548 /*
Chris@82 549 * This function contains 200 FP additions, 140 FP multiplications,
Chris@82 550 * (or, 130 additions, 70 multiplications, 70 fused multiply/add),
Chris@82 551 * 75 stack variables, 4 constants, and 100 memory accesses
Chris@82 552 */
Chris@82 553 #include "dft/scalar/q.h"
Chris@82 554
/*
 * q1_5 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): generated size-5 codelet
 * (gen_twidsq, -dif -n 5, per the generator comment above).
 *
 * rio, iio : two arrays that are read and then overwritten in place; both
 *            are indexed as x[WS(vs, v) + WS(rs, r)] for v, r in 0..4.
 * W        : twiddle values; W[0]..W[7] are read in every iteration and W
 *            advances by 8 per iteration (starting at W + mb * 8).
 * rs, vs   : the two strides used to address the 5 x 5 block of elements.
 * mb, me   : the loop runs for m = mb .. me - 1.
 * ms       : added to rio and iio after every iteration.
 *
 * Within one iteration all 25 (rio, iio) pairs are loaded before the first
 * store. For the group of five elements read at WS(vs, v), the plain sum
 * is stored at index WS(rs, v), and the four remaining results are stored
 * at WS(vs, k) + WS(rs, v) for k = 1..4 (the WS(rs, 0) term is simply
 * omitted when v = 0), i.e. results land transposed relative to the loads.
 * Result k is combined with the pair W[2k-2], W[2k-1] before being stored.
 *
 * Unlike the FMA variant, the sqrt(5)/4 scaling (KP559016994) is applied
 * during the load phase, and the difference terms are combined using
 * KP951056516 and KP587785252 (numerically sin(2*pi/5) and sin(pi/5)).
 *
 * NOTE(review): FMA, FNMS, DK, E, R, INT, WS and MAKE_VOLATILE_STRIDE come
 * from the included headers and are not visible here; the comments below
 * assume FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
Chris@82 555 static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 556 {
Chris@82 557 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 558 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 559 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 560 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 561 {
Chris@82 562 INT m;
Chris@82 563 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
/* Values that must survive from the load phase into the store phase. */
Chris@82 564 E T1, Ta, TG, Tv, T8, Tb, Tp, Tj, TD, To, Tq, Tr, TN, TW, T1s;
Chris@82 565 E T1h, TU, TX, T1b, T15, T1p, T1a, T1c, T1d, T1z, T1I, T2e, T23, T1G, T1J;
Chris@82 566 E T1X, T1R, T2b, T1W, T1Y, T1Z, T3v, T3p, T3J, T3u, T3w, T3x, T37, T3g, T3M;
Chris@82 567 E T3B, T3e, T3h, T2l, T2u, T30, T2P, T2s, T2v, T2J, T2D, T2X, T2I, T2K, T2L;
/* Load phase, group vs = 0: rio values, then iio values. Elements 1/4 and
   2/3 are paired into sums and differences; T8 is the sum of elements 1..4. */
Chris@82 568 {
Chris@82 569 E T7, Tu, T4, Tt;
Chris@82 570 T1 = rio[0];
Chris@82 571 {
Chris@82 572 E T5, T6, T2, T3;
Chris@82 573 T5 = rio[WS(rs, 2)];
Chris@82 574 T6 = rio[WS(rs, 3)];
Chris@82 575 T7 = T5 + T6;
Chris@82 576 Tu = T5 - T6;
Chris@82 577 T2 = rio[WS(rs, 1)];
Chris@82 578 T3 = rio[WS(rs, 4)];
Chris@82 579 T4 = T2 + T3;
Chris@82 580 Tt = T2 - T3;
Chris@82 581 }
Chris@82 582 Ta = KP559016994 * (T4 - T7);
Chris@82 583 TG = FNMS(KP587785252, Tt, KP951056516 * Tu);
Chris@82 584 Tv = FMA(KP951056516, Tt, KP587785252 * Tu);
Chris@82 585 T8 = T4 + T7;
Chris@82 586 Tb = FNMS(KP250000000, T8, T1);
Chris@82 587 }
Chris@82 588 {
Chris@82 589 E Ti, Tn, Tf, Tm;
Chris@82 590 Tp = iio[0];
Chris@82 591 {
Chris@82 592 E Tg, Th, Td, Te;
Chris@82 593 Tg = iio[WS(rs, 2)];
Chris@82 594 Th = iio[WS(rs, 3)];
Chris@82 595 Ti = Tg - Th;
Chris@82 596 Tn = Tg + Th;
Chris@82 597 Td = iio[WS(rs, 1)];
Chris@82 598 Te = iio[WS(rs, 4)];
Chris@82 599 Tf = Td - Te;
Chris@82 600 Tm = Td + Te;
Chris@82 601 }
Chris@82 602 Tj = FMA(KP951056516, Tf, KP587785252 * Ti);
Chris@82 603 TD = FNMS(KP587785252, Tf, KP951056516 * Ti);
Chris@82 604 To = KP559016994 * (Tm - Tn);
Chris@82 605 Tq = Tm + Tn;
Chris@82 606 Tr = FNMS(KP250000000, Tq, Tp);
Chris@82 607 }
/* Load phase, group vs = 1 (same pattern as group 0). */
Chris@82 608 {
Chris@82 609 E TT, T1g, TQ, T1f;
Chris@82 610 TN = rio[WS(vs, 1)];
Chris@82 611 {
Chris@82 612 E TR, TS, TO, TP;
Chris@82 613 TR = rio[WS(vs, 1) + WS(rs, 2)];
Chris@82 614 TS = rio[WS(vs, 1) + WS(rs, 3)];
Chris@82 615 TT = TR + TS;
Chris@82 616 T1g = TR - TS;
Chris@82 617 TO = rio[WS(vs, 1) + WS(rs, 1)];
Chris@82 618 TP = rio[WS(vs, 1) + WS(rs, 4)];
Chris@82 619 TQ = TO + TP;
Chris@82 620 T1f = TO - TP;
Chris@82 621 }
Chris@82 622 TW = KP559016994 * (TQ - TT);
Chris@82 623 T1s = FNMS(KP587785252, T1f, KP951056516 * T1g);
Chris@82 624 T1h = FMA(KP951056516, T1f, KP587785252 * T1g);
Chris@82 625 TU = TQ + TT;
Chris@82 626 TX = FNMS(KP250000000, TU, TN);
Chris@82 627 }
Chris@82 628 {
Chris@82 629 E T14, T19, T11, T18;
Chris@82 630 T1b = iio[WS(vs, 1)];
Chris@82 631 {
Chris@82 632 E T12, T13, TZ, T10;
Chris@82 633 T12 = iio[WS(vs, 1) + WS(rs, 2)];
Chris@82 634 T13 = iio[WS(vs, 1) + WS(rs, 3)];
Chris@82 635 T14 = T12 - T13;
Chris@82 636 T19 = T12 + T13;
Chris@82 637 TZ = iio[WS(vs, 1) + WS(rs, 1)];
Chris@82 638 T10 = iio[WS(vs, 1) + WS(rs, 4)];
Chris@82 639 T11 = TZ - T10;
Chris@82 640 T18 = TZ + T10;
Chris@82 641 }
Chris@82 642 T15 = FMA(KP951056516, T11, KP587785252 * T14);
Chris@82 643 T1p = FNMS(KP587785252, T11, KP951056516 * T14);
Chris@82 644 T1a = KP559016994 * (T18 - T19);
Chris@82 645 T1c = T18 + T19;
Chris@82 646 T1d = FNMS(KP250000000, T1c, T1b);
Chris@82 647 }
/* Load phase, group vs = 2. */
Chris@82 648 {
Chris@82 649 E T1F, T22, T1C, T21;
Chris@82 650 T1z = rio[WS(vs, 2)];
Chris@82 651 {
Chris@82 652 E T1D, T1E, T1A, T1B;
Chris@82 653 T1D = rio[WS(vs, 2) + WS(rs, 2)];
Chris@82 654 T1E = rio[WS(vs, 2) + WS(rs, 3)];
Chris@82 655 T1F = T1D + T1E;
Chris@82 656 T22 = T1D - T1E;
Chris@82 657 T1A = rio[WS(vs, 2) + WS(rs, 1)];
Chris@82 658 T1B = rio[WS(vs, 2) + WS(rs, 4)];
Chris@82 659 T1C = T1A + T1B;
Chris@82 660 T21 = T1A - T1B;
Chris@82 661 }
Chris@82 662 T1I = KP559016994 * (T1C - T1F);
Chris@82 663 T2e = FNMS(KP587785252, T21, KP951056516 * T22);
Chris@82 664 T23 = FMA(KP951056516, T21, KP587785252 * T22);
Chris@82 665 T1G = T1C + T1F;
Chris@82 666 T1J = FNMS(KP250000000, T1G, T1z);
Chris@82 667 }
Chris@82 668 {
Chris@82 669 E T1Q, T1V, T1N, T1U;
Chris@82 670 T1X = iio[WS(vs, 2)];
Chris@82 671 {
Chris@82 672 E T1O, T1P, T1L, T1M;
Chris@82 673 T1O = iio[WS(vs, 2) + WS(rs, 2)];
Chris@82 674 T1P = iio[WS(vs, 2) + WS(rs, 3)];
Chris@82 675 T1Q = T1O - T1P;
Chris@82 676 T1V = T1O + T1P;
Chris@82 677 T1L = iio[WS(vs, 2) + WS(rs, 1)];
Chris@82 678 T1M = iio[WS(vs, 2) + WS(rs, 4)];
Chris@82 679 T1N = T1L - T1M;
Chris@82 680 T1U = T1L + T1M;
Chris@82 681 }
Chris@82 682 T1R = FMA(KP951056516, T1N, KP587785252 * T1Q);
Chris@82 683 T2b = FNMS(KP587785252, T1N, KP951056516 * T1Q);
Chris@82 684 T1W = KP559016994 * (T1U - T1V);
Chris@82 685 T1Y = T1U + T1V;
Chris@82 686 T1Z = FNMS(KP250000000, T1Y, T1X);
Chris@82 687 }
/* Load phase, group vs = 4 (iio first, then rio). */
Chris@82 688 {
Chris@82 689 E T3o, T3t, T3l, T3s;
Chris@82 690 T3v = iio[WS(vs, 4)];
Chris@82 691 {
Chris@82 692 E T3m, T3n, T3j, T3k;
Chris@82 693 T3m = iio[WS(vs, 4) + WS(rs, 2)];
Chris@82 694 T3n = iio[WS(vs, 4) + WS(rs, 3)];
Chris@82 695 T3o = T3m - T3n;
Chris@82 696 T3t = T3m + T3n;
Chris@82 697 T3j = iio[WS(vs, 4) + WS(rs, 1)];
Chris@82 698 T3k = iio[WS(vs, 4) + WS(rs, 4)];
Chris@82 699 T3l = T3j - T3k;
Chris@82 700 T3s = T3j + T3k;
Chris@82 701 }
Chris@82 702 T3p = FMA(KP951056516, T3l, KP587785252 * T3o);
Chris@82 703 T3J = FNMS(KP587785252, T3l, KP951056516 * T3o);
Chris@82 704 T3u = KP559016994 * (T3s - T3t);
Chris@82 705 T3w = T3s + T3t;
Chris@82 706 T3x = FNMS(KP250000000, T3w, T3v);
Chris@82 707 }
Chris@82 708 {
Chris@82 709 E T3d, T3A, T3a, T3z;
Chris@82 710 T37 = rio[WS(vs, 4)];
Chris@82 711 {
Chris@82 712 E T3b, T3c, T38, T39;
Chris@82 713 T3b = rio[WS(vs, 4) + WS(rs, 2)];
Chris@82 714 T3c = rio[WS(vs, 4) + WS(rs, 3)];
Chris@82 715 T3d = T3b + T3c;
Chris@82 716 T3A = T3b - T3c;
Chris@82 717 T38 = rio[WS(vs, 4) + WS(rs, 1)];
Chris@82 718 T39 = rio[WS(vs, 4) + WS(rs, 4)];
Chris@82 719 T3a = T38 + T39;
Chris@82 720 T3z = T38 - T39;
Chris@82 721 }
Chris@82 722 T3g = KP559016994 * (T3a - T3d);
Chris@82 723 T3M = FNMS(KP587785252, T3z, KP951056516 * T3A);
Chris@82 724 T3B = FMA(KP951056516, T3z, KP587785252 * T3A);
Chris@82 725 T3e = T3a + T3d;
Chris@82 726 T3h = FNMS(KP250000000, T3e, T37);
Chris@82 727 }
/* Load phase, group vs = 3. */
Chris@82 728 {
Chris@82 729 E T2r, T2O, T2o, T2N;
Chris@82 730 T2l = rio[WS(vs, 3)];
Chris@82 731 {
Chris@82 732 E T2p, T2q, T2m, T2n;
Chris@82 733 T2p = rio[WS(vs, 3) + WS(rs, 2)];
Chris@82 734 T2q = rio[WS(vs, 3) + WS(rs, 3)];
Chris@82 735 T2r = T2p + T2q;
Chris@82 736 T2O = T2p - T2q;
Chris@82 737 T2m = rio[WS(vs, 3) + WS(rs, 1)];
Chris@82 738 T2n = rio[WS(vs, 3) + WS(rs, 4)];
Chris@82 739 T2o = T2m + T2n;
Chris@82 740 T2N = T2m - T2n;
Chris@82 741 }
Chris@82 742 T2u = KP559016994 * (T2o - T2r);
Chris@82 743 T30 = FNMS(KP587785252, T2N, KP951056516 * T2O);
Chris@82 744 T2P = FMA(KP951056516, T2N, KP587785252 * T2O);
Chris@82 745 T2s = T2o + T2r;
Chris@82 746 T2v = FNMS(KP250000000, T2s, T2l);
Chris@82 747 }
Chris@82 748 {
Chris@82 749 E T2C, T2H, T2z, T2G;
Chris@82 750 T2J = iio[WS(vs, 3)];
Chris@82 751 {
Chris@82 752 E T2A, T2B, T2x, T2y;
Chris@82 753 T2A = iio[WS(vs, 3) + WS(rs, 2)];
Chris@82 754 T2B = iio[WS(vs, 3) + WS(rs, 3)];
Chris@82 755 T2C = T2A - T2B;
Chris@82 756 T2H = T2A + T2B;
Chris@82 757 T2x = iio[WS(vs, 3) + WS(rs, 1)];
Chris@82 758 T2y = iio[WS(vs, 3) + WS(rs, 4)];
Chris@82 759 T2z = T2x - T2y;
Chris@82 760 T2G = T2x + T2y;
Chris@82 761 }
Chris@82 762 T2D = FMA(KP951056516, T2z, KP587785252 * T2C);
Chris@82 763 T2X = FNMS(KP587785252, T2z, KP951056516 * T2C);
Chris@82 764 T2I = KP559016994 * (T2G - T2H);
Chris@82 765 T2K = T2G + T2H;
Chris@82 766 T2L = FNMS(KP250000000, T2K, T2J);
Chris@82 767 }
/* Store phase begins here; every input has already been read. First the
   plain sums: the sum of the group read at WS(vs, v) goes to WS(rs, v). */
Chris@82 768 rio[0] = T1 + T8;
Chris@82 769 iio[0] = Tp + Tq;
Chris@82 770 rio[WS(rs, 1)] = TN + TU;
Chris@82 771 iio[WS(rs, 1)] = T1b + T1c;
Chris@82 772 rio[WS(rs, 2)] = T1z + T1G;
Chris@82 773 iio[WS(rs, 2)] = T1X + T1Y;
Chris@82 774 iio[WS(rs, 4)] = T3v + T3w;
Chris@82 775 rio[WS(rs, 4)] = T37 + T3e;
Chris@82 776 rio[WS(rs, 3)] = T2l + T2s;
Chris@82 777 iio[WS(rs, 3)] = T2J + T2K;
/* Group 0, results 1 and 4 (twiddle pairs W[0],W[1] and W[6],W[7]).
   Under the assumed FMA/FNMS definitions each store pair computes
   rio = Wa * re + Wb * im and iio = Wa * im - Wb * re. */
Chris@82 778 {
Chris@82 779 E Tk, Ty, Tw, TA, Tc, Ts;
Chris@82 780 Tc = Ta + Tb;
Chris@82 781 Tk = Tc + Tj;
Chris@82 782 Ty = Tc - Tj;
Chris@82 783 Ts = To + Tr;
Chris@82 784 Tw = Ts - Tv;
Chris@82 785 TA = Tv + Ts;
Chris@82 786 {
Chris@82 787 E T9, Tl, Tx, Tz;
Chris@82 788 T9 = W[0];
Chris@82 789 Tl = W[1];
Chris@82 790 rio[WS(vs, 1)] = FMA(T9, Tk, Tl * Tw);
Chris@82 791 iio[WS(vs, 1)] = FNMS(Tl, Tk, T9 * Tw);
Chris@82 792 Tx = W[6];
Chris@82 793 Tz = W[7];
Chris@82 794 rio[WS(vs, 4)] = FMA(Tx, Ty, Tz * TA);
Chris@82 795 iio[WS(vs, 4)] = FNMS(Tz, Ty, Tx * TA);
Chris@82 796 }
Chris@82 797 }
/* Group 0, results 2 and 3 (twiddle pairs W[2],W[3] and W[4],W[5]). */
Chris@82 798 {
Chris@82 799 E TE, TK, TI, TM, TC, TH;
Chris@82 800 TC = Tb - Ta;
Chris@82 801 TE = TC - TD;
Chris@82 802 TK = TC + TD;
Chris@82 803 TH = Tr - To;
Chris@82 804 TI = TG + TH;
Chris@82 805 TM = TH - TG;
Chris@82 806 {
Chris@82 807 E TB, TF, TJ, TL;
Chris@82 808 TB = W[2];
Chris@82 809 TF = W[3];
Chris@82 810 rio[WS(vs, 2)] = FMA(TB, TE, TF * TI);
Chris@82 811 iio[WS(vs, 2)] = FNMS(TF, TE, TB * TI);
Chris@82 812 TJ = W[4];
Chris@82 813 TL = W[5];
Chris@82 814 rio[WS(vs, 3)] = FMA(TJ, TK, TL * TM);
Chris@82 815 iio[WS(vs, 3)] = FNMS(TL, TK, TJ * TM);
Chris@82 816 }
Chris@82 817 }
/* Group 2, results 2 and 3. */
Chris@82 818 {
Chris@82 819 E T2c, T2i, T2g, T2k, T2a, T2f;
Chris@82 820 T2a = T1J - T1I;
Chris@82 821 T2c = T2a - T2b;
Chris@82 822 T2i = T2a + T2b;
Chris@82 823 T2f = T1Z - T1W;
Chris@82 824 T2g = T2e + T2f;
Chris@82 825 T2k = T2f - T2e;
Chris@82 826 {
Chris@82 827 E T29, T2d, T2h, T2j;
Chris@82 828 T29 = W[2];
Chris@82 829 T2d = W[3];
Chris@82 830 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T29, T2c, T2d * T2g);
Chris@82 831 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2d, T2c, T29 * T2g);
Chris@82 832 T2h = W[4];
Chris@82 833 T2j = W[5];
Chris@82 834 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2h, T2i, T2j * T2k);
Chris@82 835 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2j, T2i, T2h * T2k);
Chris@82 836 }
Chris@82 837 }
/* Group 4, results 2 and 3. */
Chris@82 838 {
Chris@82 839 E T3K, T3Q, T3O, T3S, T3I, T3N;
Chris@82 840 T3I = T3h - T3g;
Chris@82 841 T3K = T3I - T3J;
Chris@82 842 T3Q = T3I + T3J;
Chris@82 843 T3N = T3x - T3u;
Chris@82 844 T3O = T3M + T3N;
Chris@82 845 T3S = T3N - T3M;
Chris@82 846 {
Chris@82 847 E T3H, T3L, T3P, T3R;
Chris@82 848 T3H = W[2];
Chris@82 849 T3L = W[3];
Chris@82 850 rio[WS(vs, 2) + WS(rs, 4)] = FMA(T3H, T3K, T3L * T3O);
Chris@82 851 iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T3L, T3K, T3H * T3O);
Chris@82 852 T3P = W[4];
Chris@82 853 T3R = W[5];
Chris@82 854 rio[WS(vs, 3) + WS(rs, 4)] = FMA(T3P, T3Q, T3R * T3S);
Chris@82 855 iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T3R, T3Q, T3P * T3S);
Chris@82 856 }
Chris@82 857 }
/* Group 2, results 1 and 4. */
Chris@82 858 {
Chris@82 859 E T1S, T26, T24, T28, T1K, T20;
Chris@82 860 T1K = T1I + T1J;
Chris@82 861 T1S = T1K + T1R;
Chris@82 862 T26 = T1K - T1R;
Chris@82 863 T20 = T1W + T1Z;
Chris@82 864 T24 = T20 - T23;
Chris@82 865 T28 = T23 + T20;
Chris@82 866 {
Chris@82 867 E T1H, T1T, T25, T27;
Chris@82 868 T1H = W[0];
Chris@82 869 T1T = W[1];
Chris@82 870 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1H, T1S, T1T * T24);
Chris@82 871 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1T, T1S, T1H * T24);
Chris@82 872 T25 = W[6];
Chris@82 873 T27 = W[7];
Chris@82 874 rio[WS(vs, 4) + WS(rs, 2)] = FMA(T25, T26, T27 * T28);
Chris@82 875 iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T27, T26, T25 * T28);
Chris@82 876 }
Chris@82 877 }
/* Group 3, results 1 and 4. */
Chris@82 878 {
Chris@82 879 E T2E, T2S, T2Q, T2U, T2w, T2M;
Chris@82 880 T2w = T2u + T2v;
Chris@82 881 T2E = T2w + T2D;
Chris@82 882 T2S = T2w - T2D;
Chris@82 883 T2M = T2I + T2L;
Chris@82 884 T2Q = T2M - T2P;
Chris@82 885 T2U = T2P + T2M;
Chris@82 886 {
Chris@82 887 E T2t, T2F, T2R, T2T;
Chris@82 888 T2t = W[0];
Chris@82 889 T2F = W[1];
Chris@82 890 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T2t, T2E, T2F * T2Q);
Chris@82 891 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T2F, T2E, T2t * T2Q);
Chris@82 892 T2R = W[6];
Chris@82 893 T2T = W[7];
Chris@82 894 rio[WS(vs, 4) + WS(rs, 3)] = FMA(T2R, T2S, T2T * T2U);
Chris@82 895 iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T2T, T2S, T2R * T2U);
Chris@82 896 }
Chris@82 897 }
/* Group 3, results 2 and 3. */
Chris@82 898 {
Chris@82 899 E T2Y, T34, T32, T36, T2W, T31;
Chris@82 900 T2W = T2v - T2u;
Chris@82 901 T2Y = T2W - T2X;
Chris@82 902 T34 = T2W + T2X;
Chris@82 903 T31 = T2L - T2I;
Chris@82 904 T32 = T30 + T31;
Chris@82 905 T36 = T31 - T30;
Chris@82 906 {
Chris@82 907 E T2V, T2Z, T33, T35;
Chris@82 908 T2V = W[2];
Chris@82 909 T2Z = W[3];
Chris@82 910 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T2V, T2Y, T2Z * T32);
Chris@82 911 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T2Z, T2Y, T2V * T32);
Chris@82 912 T33 = W[4];
Chris@82 913 T35 = W[5];
Chris@82 914 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T33, T34, T35 * T36);
Chris@82 915 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T35, T34, T33 * T36);
Chris@82 916 }
Chris@82 917 }
/* Group 4, results 1 and 4. */
Chris@82 918 {
Chris@82 919 E T3q, T3E, T3C, T3G, T3i, T3y;
Chris@82 920 T3i = T3g + T3h;
Chris@82 921 T3q = T3i + T3p;
Chris@82 922 T3E = T3i - T3p;
Chris@82 923 T3y = T3u + T3x;
Chris@82 924 T3C = T3y - T3B;
Chris@82 925 T3G = T3B + T3y;
Chris@82 926 {
Chris@82 927 E T3f, T3r, T3D, T3F;
Chris@82 928 T3f = W[0];
Chris@82 929 T3r = W[1];
Chris@82 930 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3f, T3q, T3r * T3C);
Chris@82 931 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3r, T3q, T3f * T3C);
Chris@82 932 T3D = W[6];
Chris@82 933 T3F = W[7];
Chris@82 934 rio[WS(vs, 4) + WS(rs, 4)] = FMA(T3D, T3E, T3F * T3G);
Chris@82 935 iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T3F, T3E, T3D * T3G);
Chris@82 936 }
Chris@82 937 }
/* Group 1, results 2 and 3. */
Chris@82 938 {
Chris@82 939 E T1q, T1w, T1u, T1y, T1o, T1t;
Chris@82 940 T1o = TX - TW;
Chris@82 941 T1q = T1o - T1p;
Chris@82 942 T1w = T1o + T1p;
Chris@82 943 T1t = T1d - T1a;
Chris@82 944 T1u = T1s + T1t;
Chris@82 945 T1y = T1t - T1s;
Chris@82 946 {
Chris@82 947 E T1n, T1r, T1v, T1x;
Chris@82 948 T1n = W[2];
Chris@82 949 T1r = W[3];
Chris@82 950 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1n, T1q, T1r * T1u);
Chris@82 951 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1r, T1q, T1n * T1u);
Chris@82 952 T1v = W[4];
Chris@82 953 T1x = W[5];
Chris@82 954 rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1v, T1w, T1x * T1y);
Chris@82 955 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1x, T1w, T1v * T1y);
Chris@82 956 }
Chris@82 957 }
/* Group 1, results 1 and 4. */
Chris@82 958 {
Chris@82 959 E T16, T1k, T1i, T1m, TY, T1e;
Chris@82 960 TY = TW + TX;
Chris@82 961 T16 = TY + T15;
Chris@82 962 T1k = TY - T15;
Chris@82 963 T1e = T1a + T1d;
Chris@82 964 T1i = T1e - T1h;
Chris@82 965 T1m = T1h + T1e;
Chris@82 966 {
Chris@82 967 E TV, T17, T1j, T1l;
Chris@82 968 TV = W[0];
Chris@82 969 T17 = W[1];
Chris@82 970 rio[WS(vs, 1) + WS(rs, 1)] = FMA(TV, T16, T17 * T1i);
Chris@82 971 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T17, T16, TV * T1i);
Chris@82 972 T1j = W[6];
Chris@82 973 T1l = W[7];
Chris@82 974 rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1j, T1k, T1l * T1m);
Chris@82 975 iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1l, T1k, T1j * T1m);
Chris@82 976 }
Chris@82 977 }
Chris@82 978 }
Chris@82 979 }
Chris@82 980 }
Chris@82 981
/*
 * Twiddle instruction table referenced by `desc` below: one TW_FULL entry
 * with arguments (0, 5) followed by one TW_NEXT entry with arguments (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * file; presumably TW_NEXT terminates the list -- confirm.
 */
Chris@82 982 static const tw_instr twinstr[] = {
Chris@82 983 {TW_FULL, 0, 5},
Chris@82 984 {TW_NEXT, 1, 0}
Chris@82 985 };
Chris@82 986
/*
 * Descriptor passed to the planner together with q1_5 (non-FMA variant).
 * It holds the value 5, the name "q1_5", the twiddle table, &GENUS, and the
 * counts {130, 70, 70, 0}; the first three counts match the "130 additions,
 * 70 multiplications, 70 fused multiply/add" line in the generated header
 * comment of this variant.
 * NOTE(review): the ct_desc field layout is declared elsewhere; the leading
 * 5 is presumably the radix and the trailing zeros are unexplained here.
 */
Chris@82 987 static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, {130, 70, 70, 0}, 0, 0, 0 };
Chris@82 988
/*
 * Registration entry point for this codelet: hands the q1_5 kernel and its
 * descriptor to planner p through X(kdft_difsq_register).
 */
Chris@82 989 void X(codelet_q1_5) (planner *p) {
Chris@82 990 X(kdft_difsq_register) (p, q1_5, &desc);
Chris@82 991 }
Chris@82 992 #endif