annotate src/fftw-3.3.3/dft/scalar/codelets/q1_5.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:36:23 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twidsq.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include q.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 200 FP additions, 170 FP multiplications,
Chris@10 32 * (or, 70 additions, 40 multiplications, 130 fused multiply/add),
Chris@10 33 * 104 stack variables, 4 constants, and 100 memory accesses
Chris@10 34 */
Chris@10 35 #include "q.h"
Chris@10 36
/*
 * q1_5 (HAVE_FMA variant): twiddle "square" codelet emitted by gen_twidsq
 * with -dif -n 5 (see the "Generated by" comment above).
 *
 * Each loop iteration works in place on a 5x5 block of complex values held
 * as separate real (rio) and imaginary (iio) arrays and addressed as
 * [WS(vs, j) + WS(rs, k)], j, k = 0..4:
 *   1. all 25 real and 25 imaginary inputs are loaded, and the first stage
 *      of a 5-point butterfly is formed along k for every j;
 *   2. the plain sum of row j is stored at [WS(rs, j)] (no twiddle);
 *   3. the four remaining outputs of row j are combined with the twiddle
 *      pairs (W[0],W[1]), (W[2],W[3]), (W[4],W[5]), (W[6],W[7]) and stored
 *      at [WS(vs, n) + WS(rs, j)], n = 1..4, i.e. at the transposed slot.
 * No output is stored before every input has been loaded.
 *
 * rio, iio: real / imaginary data, read and overwritten.
 * W:        twiddle table; 8 values are used per iteration, starting at
 *           W + mb * 8.
 * rs, vs:   the two strides of the 5x5 block (only used through WS()).
 * mb, me:   the loop runs for m = mb .. me - 1.
 * ms:       element offset added to rio and iio after every iteration.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from the included headers and are not visible
 * here; FMA(a, b, c) / FNMS(a, b, c) are presumed to mean a*b + c and
 * c - a*b -- confirm against the FFTW headers.
 */
Chris@10 37 static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@10 38 {
/* Butterfly constants; the literals match sin(2*pi/5), sqrt(5)/4, 1/4 and
 * (sqrt(5) - 1)/2, in that order. */
Chris@10 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@10 43 {
Chris@10 44 INT m;
/* W is offset by mb * 8 once, then advanced by 8 per iteration together
 * with rio and iio (by ms). */
Chris@10 45 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
Chris@10 46 E T1x, T1w, T1v;
Chris@10 47 {
Chris@10 48 E T1, Tn, TM, Tw, Tb, T8, Ta, TV, Tq, Ts, TH, Tj, Tr, T1h, T1q;
Chris@10 49 E T1G, T12, T15, T1P, T14, T1k, T1m, T1B, T1d, T1l, T2b, T2k, T2A, T1W, T1Z;
Chris@10 50 E T3Z, T1Y, T2e, T2g, T2v, T27, T2f, T3D, T42, T44, T4j, T3V, T43, T2J, T48;
Chris@10 51 E T3K, T3N, T35, T3M, T2V, T3e, T3u, T2Q, T2T, T37, T30, T2S, T2W;
/* Load phase: read every rio/iio input of the 5x5 block and form the
 * pairwise sums and differences along the rs index for each vs row. */
Chris@10 52 {
Chris@10 53 E T1Q, T2j, T1V, T1R;
Chris@10 54 {
Chris@10 55 E Tp, Ti, Td, Te;
Chris@10 56 {
Chris@10 57 E T5, T6, T2, T3, T7, Tv;
Chris@10 58 T1 = rio[0];
Chris@10 59 T5 = rio[WS(rs, 2)];
Chris@10 60 T6 = rio[WS(rs, 3)];
Chris@10 61 T2 = rio[WS(rs, 1)];
Chris@10 62 T3 = rio[WS(rs, 4)];
Chris@10 63 Tn = iio[0];
Chris@10 64 T7 = T5 + T6;
Chris@10 65 Tv = T5 - T6;
Chris@10 66 {
Chris@10 67 E T4, Tu, Tg, Th;
Chris@10 68 T4 = T2 + T3;
Chris@10 69 Tu = T2 - T3;
Chris@10 70 Tg = iio[WS(rs, 2)];
Chris@10 71 Th = iio[WS(rs, 3)];
Chris@10 72 TM = FNMS(KP618033988, Tu, Tv);
Chris@10 73 Tw = FMA(KP618033988, Tv, Tu);
Chris@10 74 Tb = T4 - T7;
Chris@10 75 T8 = T4 + T7;
Chris@10 76 Tp = Tg + Th;
Chris@10 77 Ti = Tg - Th;
Chris@10 78 Ta = FNMS(KP250000000, T8, T1);
Chris@10 79 Td = iio[WS(rs, 1)];
Chris@10 80 Te = iio[WS(rs, 4)];
Chris@10 81 }
Chris@10 82 }
Chris@10 83 {
Chris@10 84 E TW, T1p, T11, TX;
Chris@10 85 TV = rio[WS(vs, 1)];
Chris@10 86 {
Chris@10 87 E TZ, T10, Tf, To;
Chris@10 88 TZ = rio[WS(vs, 1) + WS(rs, 2)];
Chris@10 89 T10 = rio[WS(vs, 1) + WS(rs, 3)];
Chris@10 90 Tf = Td - Te;
Chris@10 91 To = Td + Te;
Chris@10 92 TW = rio[WS(vs, 1) + WS(rs, 1)];
Chris@10 93 T1p = TZ - T10;
Chris@10 94 T11 = TZ + T10;
Chris@10 95 Tq = To + Tp;
Chris@10 96 Ts = To - Tp;
Chris@10 97 TH = FNMS(KP618033988, Tf, Ti);
Chris@10 98 Tj = FMA(KP618033988, Ti, Tf);
Chris@10 99 Tr = FNMS(KP250000000, Tq, Tn);
Chris@10 100 TX = rio[WS(vs, 1) + WS(rs, 4)];
Chris@10 101 }
Chris@10 102 {
Chris@10 103 E T17, T1j, T1c, T18;
Chris@10 104 T1h = iio[WS(vs, 1)];
Chris@10 105 {
Chris@10 106 E T1a, T1b, TY, T1o;
Chris@10 107 T1a = iio[WS(vs, 1) + WS(rs, 2)];
Chris@10 108 T1b = iio[WS(vs, 1) + WS(rs, 3)];
Chris@10 109 TY = TW + TX;
Chris@10 110 T1o = TW - TX;
Chris@10 111 T17 = iio[WS(vs, 1) + WS(rs, 1)];
Chris@10 112 T1j = T1a + T1b;
Chris@10 113 T1c = T1a - T1b;
Chris@10 114 T1q = FMA(KP618033988, T1p, T1o);
Chris@10 115 T1G = FNMS(KP618033988, T1o, T1p);
Chris@10 116 T12 = TY + T11;
Chris@10 117 T15 = TY - T11;
Chris@10 118 T18 = iio[WS(vs, 1) + WS(rs, 4)];
Chris@10 119 }
Chris@10 120 T1P = rio[WS(vs, 2)];
Chris@10 121 T14 = FNMS(KP250000000, T12, TV);
Chris@10 122 {
Chris@10 123 E T1T, T1i, T19, T1U;
Chris@10 124 T1T = rio[WS(vs, 2) + WS(rs, 2)];
Chris@10 125 T1i = T17 + T18;
Chris@10 126 T19 = T17 - T18;
Chris@10 127 T1U = rio[WS(vs, 2) + WS(rs, 3)];
Chris@10 128 T1Q = rio[WS(vs, 2) + WS(rs, 1)];
Chris@10 129 T1k = T1i + T1j;
Chris@10 130 T1m = T1i - T1j;
Chris@10 131 T1B = FNMS(KP618033988, T19, T1c);
Chris@10 132 T1d = FMA(KP618033988, T1c, T19);
Chris@10 133 T2j = T1T - T1U;
Chris@10 134 T1V = T1T + T1U;
Chris@10 135 T1l = FNMS(KP250000000, T1k, T1h);
Chris@10 136 T1R = rio[WS(vs, 2) + WS(rs, 4)];
Chris@10 137 }
Chris@10 138 }
Chris@10 139 }
Chris@10 140 }
Chris@10 141 {
Chris@10 142 E T3P, T41, T3U, T3Q;
Chris@10 143 {
Chris@10 144 E T21, T2d, T26, T22;
Chris@10 145 T2b = iio[WS(vs, 2)];
Chris@10 146 {
Chris@10 147 E T24, T25, T1S, T2i;
Chris@10 148 T24 = iio[WS(vs, 2) + WS(rs, 2)];
Chris@10 149 T25 = iio[WS(vs, 2) + WS(rs, 3)];
Chris@10 150 T1S = T1Q + T1R;
Chris@10 151 T2i = T1Q - T1R;
Chris@10 152 T21 = iio[WS(vs, 2) + WS(rs, 1)];
Chris@10 153 T2d = T24 + T25;
Chris@10 154 T26 = T24 - T25;
Chris@10 155 T2k = FMA(KP618033988, T2j, T2i);
Chris@10 156 T2A = FNMS(KP618033988, T2i, T2j);
Chris@10 157 T1W = T1S + T1V;
Chris@10 158 T1Z = T1S - T1V;
Chris@10 159 T22 = iio[WS(vs, 2) + WS(rs, 4)];
Chris@10 160 }
Chris@10 161 T3Z = iio[WS(vs, 4)];
Chris@10 162 T1Y = FNMS(KP250000000, T1W, T1P);
Chris@10 163 {
Chris@10 164 E T3S, T2c, T23, T3T;
Chris@10 165 T3S = iio[WS(vs, 4) + WS(rs, 2)];
Chris@10 166 T2c = T21 + T22;
Chris@10 167 T23 = T21 - T22;
Chris@10 168 T3T = iio[WS(vs, 4) + WS(rs, 3)];
Chris@10 169 T3P = iio[WS(vs, 4) + WS(rs, 1)];
Chris@10 170 T2e = T2c + T2d;
Chris@10 171 T2g = T2c - T2d;
Chris@10 172 T2v = FNMS(KP618033988, T23, T26);
Chris@10 173 T27 = FMA(KP618033988, T26, T23);
Chris@10 174 T41 = T3S + T3T;
Chris@10 175 T3U = T3S - T3T;
Chris@10 176 T2f = FNMS(KP250000000, T2e, T2b);
Chris@10 177 T3Q = iio[WS(vs, 4) + WS(rs, 4)];
Chris@10 178 }
Chris@10 179 }
Chris@10 180 {
Chris@10 181 E T3E, T47, T3J, T3F;
Chris@10 182 T3D = rio[WS(vs, 4)];
Chris@10 183 {
Chris@10 184 E T3H, T3I, T3R, T40;
Chris@10 185 T3H = rio[WS(vs, 4) + WS(rs, 2)];
Chris@10 186 T3I = rio[WS(vs, 4) + WS(rs, 3)];
Chris@10 187 T3R = T3P - T3Q;
Chris@10 188 T40 = T3P + T3Q;
Chris@10 189 T3E = rio[WS(vs, 4) + WS(rs, 1)];
Chris@10 190 T47 = T3H - T3I;
Chris@10 191 T3J = T3H + T3I;
Chris@10 192 T42 = T40 + T41;
Chris@10 193 T44 = T40 - T41;
Chris@10 194 T4j = FNMS(KP618033988, T3R, T3U);
Chris@10 195 T3V = FMA(KP618033988, T3U, T3R);
Chris@10 196 T43 = FNMS(KP250000000, T42, T3Z);
Chris@10 197 T3F = rio[WS(vs, 4) + WS(rs, 4)];
Chris@10 198 }
Chris@10 199 {
Chris@10 200 E T2K, T3d, T2P, T2L;
Chris@10 201 T2J = rio[WS(vs, 3)];
Chris@10 202 {
Chris@10 203 E T2N, T2O, T3G, T46;
Chris@10 204 T2N = rio[WS(vs, 3) + WS(rs, 2)];
Chris@10 205 T2O = rio[WS(vs, 3) + WS(rs, 3)];
Chris@10 206 T3G = T3E + T3F;
Chris@10 207 T46 = T3E - T3F;
Chris@10 208 T2K = rio[WS(vs, 3) + WS(rs, 1)];
Chris@10 209 T3d = T2N - T2O;
Chris@10 210 T2P = T2N + T2O;
Chris@10 211 T48 = FMA(KP618033988, T47, T46);
Chris@10 212 T4o = FNMS(KP618033988, T46, T47);
Chris@10 213 T3K = T3G + T3J;
Chris@10 214 T3N = T3G - T3J;
Chris@10 215 T2L = rio[WS(vs, 3) + WS(rs, 4)];
Chris@10 216 }
Chris@10 217 T35 = iio[WS(vs, 3)];
Chris@10 218 T3M = FNMS(KP250000000, T3K, T3D);
Chris@10 219 {
Chris@10 220 E T2Y, T3c, T2M, T2Z;
Chris@10 221 T2Y = iio[WS(vs, 3) + WS(rs, 2)];
Chris@10 222 T3c = T2K - T2L;
Chris@10 223 T2M = T2K + T2L;
Chris@10 224 T2Z = iio[WS(vs, 3) + WS(rs, 3)];
Chris@10 225 T2V = iio[WS(vs, 3) + WS(rs, 1)];
Chris@10 226 T3e = FMA(KP618033988, T3d, T3c);
Chris@10 227 T3u = FNMS(KP618033988, T3c, T3d);
Chris@10 228 T2Q = T2M + T2P;
Chris@10 229 T2T = T2M - T2P;
Chris@10 230 T37 = T2Y + T2Z;
Chris@10 231 T30 = T2Y - T2Z;
Chris@10 232 T2S = FNMS(KP250000000, T2Q, T2J);
Chris@10 233 T2W = iio[WS(vs, 3) + WS(rs, 4)];
Chris@10 234 }
Chris@10 235 }
Chris@10 236 }
Chris@10 237 }
Chris@10 238 }
Chris@10 239 {
Chris@10 240 E T3a, T31, T3p, T39, T2X, T36, T38;
/* Store phase, part 1: the untwiddled sum of vs row j goes to [WS(rs, j)].
 * The remaining first-stage terms of the iio values of vs row 3 are
 * finished in between these stores. */
Chris@10 241 rio[0] = T1 + T8;
Chris@10 242 iio[0] = Tn + Tq;
Chris@10 243 rio[WS(rs, 1)] = TV + T12;
Chris@10 244 T2X = T2V - T2W;
Chris@10 245 T36 = T2V + T2W;
Chris@10 246 iio[WS(rs, 1)] = T1h + T1k;
Chris@10 247 rio[WS(rs, 2)] = T1P + T1W;
Chris@10 248 T3a = T36 - T37;
Chris@10 249 T38 = T36 + T37;
Chris@10 250 T31 = FMA(KP618033988, T30, T2X);
Chris@10 251 T3p = FNMS(KP618033988, T2X, T30);
Chris@10 252 T39 = FNMS(KP250000000, T38, T35);
Chris@10 253 iio[WS(rs, 2)] = T2b + T2e;
Chris@10 254 iio[WS(rs, 4)] = T3Z + T42;
Chris@10 255 rio[WS(rs, 4)] = T3D + T3K;
Chris@10 256 rio[WS(rs, 3)] = T2J + T2Q;
Chris@10 257 iio[WS(rs, 3)] = T35 + T38;
/* Store phase, part 2: for each vs row j, finish the butterfly and write
 * output n (1..4), combined with W[2n-2] and W[2n-1], to
 * [WS(vs, n) + WS(rs, j)]. */
Chris@10 258 {
Chris@10 259 E T3O, T45, T2r, T2q, T2p, TT, TS, TR;
Chris@10 260 {
Chris@10 261 E TG, TL, TD, TC, TB, Tc, Tt;
Chris@10 262 TG = FNMS(KP559016994, Tb, Ta);
Chris@10 263 Tc = FMA(KP559016994, Tb, Ta);
Chris@10 264 Tt = FMA(KP559016994, Ts, Tr);
Chris@10 265 TL = FNMS(KP559016994, Ts, Tr);
Chris@10 266 {
Chris@10 267 E T9, Tm, Tk, TA, Tx;
Chris@10 268 T9 = W[0];
Chris@10 269 Tm = W[1];
Chris@10 270 Tk = FMA(KP951056516, Tj, Tc);
Chris@10 271 TA = FNMS(KP951056516, Tj, Tc);
Chris@10 272 Tx = FNMS(KP951056516, Tw, Tt);
Chris@10 273 TD = FMA(KP951056516, Tw, Tt);
Chris@10 274 {
Chris@10 275 E Tz, Tl, Ty, TE;
Chris@10 276 Tz = W[6];
Chris@10 277 Tl = T9 * Tk;
Chris@10 278 TC = W[7];
Chris@10 279 Ty = T9 * Tx;
Chris@10 280 TE = Tz * TD;
Chris@10 281 TB = Tz * TA;
Chris@10 282 rio[WS(vs, 1)] = FMA(Tm, Tx, Tl);
Chris@10 283 iio[WS(vs, 1)] = FNMS(Tm, Tk, Ty);
Chris@10 284 iio[WS(vs, 4)] = FNMS(TC, TA, TE);
Chris@10 285 }
Chris@10 286 }
Chris@10 287 rio[WS(vs, 4)] = FMA(TC, TD, TB);
Chris@10 288 {
Chris@10 289 E TF, TK, TI, TQ, TN;
Chris@10 290 TF = W[2];
Chris@10 291 TK = W[3];
Chris@10 292 TI = FNMS(KP951056516, TH, TG);
Chris@10 293 TQ = FMA(KP951056516, TH, TG);
Chris@10 294 TN = FMA(KP951056516, TM, TL);
Chris@10 295 TT = FNMS(KP951056516, TM, TL);
Chris@10 296 {
Chris@10 297 E TP, TJ, TO, TU;
Chris@10 298 TP = W[4];
Chris@10 299 TJ = TF * TI;
Chris@10 300 TS = W[5];
Chris@10 301 TO = TF * TN;
Chris@10 302 TU = TP * TT;
Chris@10 303 TR = TP * TQ;
Chris@10 304 rio[WS(vs, 2)] = FMA(TK, TN, TJ);
Chris@10 305 iio[WS(vs, 2)] = FNMS(TK, TI, TO);
Chris@10 306 iio[WS(vs, 3)] = FNMS(TS, TQ, TU);
Chris@10 307 }
Chris@10 308 }
Chris@10 309 }
Chris@10 310 rio[WS(vs, 3)] = FMA(TS, TT, TR);
Chris@10 311 {
Chris@10 312 E T20, T2h, T2H, T2G, T2F, T2u, T2z;
Chris@10 313 T20 = FMA(KP559016994, T1Z, T1Y);
Chris@10 314 T2u = FNMS(KP559016994, T1Z, T1Y);
Chris@10 315 T2z = FNMS(KP559016994, T2g, T2f);
Chris@10 316 T2h = FMA(KP559016994, T2g, T2f);
Chris@10 317 {
Chris@10 318 E T2t, T2y, T2w, T2E, T2B;
Chris@10 319 T2t = W[2];
Chris@10 320 T2y = W[3];
Chris@10 321 T2w = FNMS(KP951056516, T2v, T2u);
Chris@10 322 T2E = FMA(KP951056516, T2v, T2u);
Chris@10 323 T2B = FMA(KP951056516, T2A, T2z);
Chris@10 324 T2H = FNMS(KP951056516, T2A, T2z);
Chris@10 325 {
Chris@10 326 E T2D, T2x, T2C, T2I;
Chris@10 327 T2D = W[4];
Chris@10 328 T2x = T2t * T2w;
Chris@10 329 T2G = W[5];
Chris@10 330 T2C = T2t * T2B;
Chris@10 331 T2I = T2D * T2H;
Chris@10 332 T2F = T2D * T2E;
Chris@10 333 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T2y, T2B, T2x);
Chris@10 334 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2y, T2w, T2C);
Chris@10 335 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2G, T2E, T2I);
Chris@10 336 }
Chris@10 337 }
Chris@10 338 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2G, T2H, T2F);
Chris@10 339 {
Chris@10 340 E T4v, T4u, T4t, T4i, T4n;
Chris@10 341 T3O = FMA(KP559016994, T3N, T3M);
Chris@10 342 T4i = FNMS(KP559016994, T3N, T3M);
Chris@10 343 T4n = FNMS(KP559016994, T44, T43);
Chris@10 344 T45 = FMA(KP559016994, T44, T43);
Chris@10 345 {
Chris@10 346 E T4h, T4m, T4k, T4s, T4p;
Chris@10 347 T4h = W[2];
Chris@10 348 T4m = W[3];
Chris@10 349 T4k = FNMS(KP951056516, T4j, T4i);
Chris@10 350 T4s = FMA(KP951056516, T4j, T4i);
Chris@10 351 T4p = FMA(KP951056516, T4o, T4n);
Chris@10 352 T4v = FNMS(KP951056516, T4o, T4n);
Chris@10 353 {
Chris@10 354 E T4r, T4l, T4q, T4w;
Chris@10 355 T4r = W[4];
Chris@10 356 T4l = T4h * T4k;
Chris@10 357 T4u = W[5];
Chris@10 358 T4q = T4h * T4p;
Chris@10 359 T4w = T4r * T4v;
Chris@10 360 T4t = T4r * T4s;
Chris@10 361 rio[WS(vs, 2) + WS(rs, 4)] = FMA(T4m, T4p, T4l);
Chris@10 362 iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T4m, T4k, T4q);
Chris@10 363 iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4u, T4s, T4w);
Chris@10 364 }
Chris@10 365 }
Chris@10 366 rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4u, T4v, T4t);
Chris@10 367 {
Chris@10 368 E T1X, T2a, T28, T2o, T2l;
Chris@10 369 T1X = W[0];
Chris@10 370 T2a = W[1];
Chris@10 371 T28 = FMA(KP951056516, T27, T20);
Chris@10 372 T2o = FNMS(KP951056516, T27, T20);
Chris@10 373 T2l = FNMS(KP951056516, T2k, T2h);
Chris@10 374 T2r = FMA(KP951056516, T2k, T2h);
Chris@10 375 {
Chris@10 376 E T2n, T29, T2m, T2s;
Chris@10 377 T2n = W[6];
Chris@10 378 T29 = T1X * T28;
Chris@10 379 T2q = W[7];
Chris@10 380 T2m = T1X * T2l;
Chris@10 381 T2s = T2n * T2r;
Chris@10 382 T2p = T2n * T2o;
Chris@10 383 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T2a, T2l, T29);
Chris@10 384 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2a, T28, T2m);
Chris@10 385 iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T2q, T2o, T2s);
Chris@10 386 }
Chris@10 387 }
Chris@10 388 }
Chris@10 389 }
Chris@10 390 rio[WS(vs, 4) + WS(rs, 2)] = FMA(T2q, T2r, T2p);
Chris@10 391 {
Chris@10 392 E T3B, T3A, T3z, T4f, T4e, T4d;
Chris@10 393 {
Chris@10 394 E T3o, T3t, T3l, T3k, T3j, T2U, T3b;
Chris@10 395 T3o = FNMS(KP559016994, T2T, T2S);
Chris@10 396 T2U = FMA(KP559016994, T2T, T2S);
Chris@10 397 T3b = FMA(KP559016994, T3a, T39);
Chris@10 398 T3t = FNMS(KP559016994, T3a, T39);
Chris@10 399 {
Chris@10 400 E T2R, T34, T32, T3i, T3f;
Chris@10 401 T2R = W[0];
Chris@10 402 T34 = W[1];
Chris@10 403 T32 = FMA(KP951056516, T31, T2U);
Chris@10 404 T3i = FNMS(KP951056516, T31, T2U);
Chris@10 405 T3f = FNMS(KP951056516, T3e, T3b);
Chris@10 406 T3l = FMA(KP951056516, T3e, T3b);
Chris@10 407 {
Chris@10 408 E T3h, T33, T3g, T3m;
Chris@10 409 T3h = W[6];
Chris@10 410 T33 = T2R * T32;
Chris@10 411 T3k = W[7];
Chris@10 412 T3g = T2R * T3f;
Chris@10 413 T3m = T3h * T3l;
Chris@10 414 T3j = T3h * T3i;
Chris@10 415 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T34, T3f, T33);
Chris@10 416 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T34, T32, T3g);
Chris@10 417 iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T3k, T3i, T3m);
Chris@10 418 }
Chris@10 419 }
Chris@10 420 rio[WS(vs, 4) + WS(rs, 3)] = FMA(T3k, T3l, T3j);
Chris@10 421 {
Chris@10 422 E T3n, T3s, T3q, T3y, T3v;
Chris@10 423 T3n = W[2];
Chris@10 424 T3s = W[3];
Chris@10 425 T3q = FNMS(KP951056516, T3p, T3o);
Chris@10 426 T3y = FMA(KP951056516, T3p, T3o);
Chris@10 427 T3v = FMA(KP951056516, T3u, T3t);
Chris@10 428 T3B = FNMS(KP951056516, T3u, T3t);
Chris@10 429 {
Chris@10 430 E T3x, T3r, T3w, T3C;
Chris@10 431 T3x = W[4];
Chris@10 432 T3r = T3n * T3q;
Chris@10 433 T3A = W[5];
Chris@10 434 T3w = T3n * T3v;
Chris@10 435 T3C = T3x * T3B;
Chris@10 436 T3z = T3x * T3y;
Chris@10 437 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T3s, T3v, T3r);
Chris@10 438 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T3s, T3q, T3w);
Chris@10 439 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3A, T3y, T3C);
Chris@10 440 }
Chris@10 441 }
Chris@10 442 }
Chris@10 443 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3A, T3B, T3z);
Chris@10 444 {
Chris@10 445 E T3L, T3Y, T3W, T4c, T49;
Chris@10 446 T3L = W[0];
Chris@10 447 T3Y = W[1];
Chris@10 448 T3W = FMA(KP951056516, T3V, T3O);
Chris@10 449 T4c = FNMS(KP951056516, T3V, T3O);
Chris@10 450 T49 = FNMS(KP951056516, T48, T45);
Chris@10 451 T4f = FMA(KP951056516, T48, T45);
Chris@10 452 {
Chris@10 453 E T4b, T3X, T4a, T4g;
Chris@10 454 T4b = W[6];
Chris@10 455 T3X = T3L * T3W;
Chris@10 456 T4e = W[7];
Chris@10 457 T4a = T3L * T49;
Chris@10 458 T4g = T4b * T4f;
Chris@10 459 T4d = T4b * T4c;
Chris@10 460 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3Y, T49, T3X);
Chris@10 461 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3Y, T3W, T4a);
Chris@10 462 iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T4e, T4c, T4g);
Chris@10 463 }
Chris@10 464 }
Chris@10 465 rio[WS(vs, 4) + WS(rs, 4)] = FMA(T4e, T4f, T4d);
Chris@10 466 {
Chris@10 467 E T16, T1n, T1N, T1M, T1L, T1A, T1F;
Chris@10 468 T16 = FMA(KP559016994, T15, T14);
Chris@10 469 T1A = FNMS(KP559016994, T15, T14);
Chris@10 470 T1F = FNMS(KP559016994, T1m, T1l);
Chris@10 471 T1n = FMA(KP559016994, T1m, T1l);
Chris@10 472 {
Chris@10 473 E T1z, T1E, T1C, T1K, T1H;
Chris@10 474 T1z = W[2];
Chris@10 475 T1E = W[3];
Chris@10 476 T1C = FNMS(KP951056516, T1B, T1A);
Chris@10 477 T1K = FMA(KP951056516, T1B, T1A);
Chris@10 478 T1H = FMA(KP951056516, T1G, T1F);
Chris@10 479 T1N = FNMS(KP951056516, T1G, T1F);
Chris@10 480 {
Chris@10 481 E T1J, T1D, T1I, T1O;
Chris@10 482 T1J = W[4];
Chris@10 483 T1D = T1z * T1C;
Chris@10 484 T1M = W[5];
Chris@10 485 T1I = T1z * T1H;
Chris@10 486 T1O = T1J * T1N;
Chris@10 487 T1L = T1J * T1K;
Chris@10 488 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1E, T1H, T1D);
Chris@10 489 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1E, T1C, T1I);
Chris@10 490 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1M, T1K, T1O);
Chris@10 491 }
Chris@10 492 }
Chris@10 493 rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1M, T1N, T1L);
Chris@10 494 {
Chris@10 495 E T13, T1g, T1e, T1u, T1r;
Chris@10 496 T13 = W[0];
Chris@10 497 T1g = W[1];
Chris@10 498 T1e = FMA(KP951056516, T1d, T16);
Chris@10 499 T1u = FNMS(KP951056516, T1d, T16);
Chris@10 500 T1r = FNMS(KP951056516, T1q, T1n);
Chris@10 501 T1x = FMA(KP951056516, T1q, T1n);
Chris@10 502 {
Chris@10 503 E T1t, T1f, T1s, T1y;
Chris@10 504 T1t = W[6];
Chris@10 505 T1f = T13 * T1e;
Chris@10 506 T1w = W[7];
Chris@10 507 T1s = T13 * T1r;
Chris@10 508 T1y = T1t * T1x;
Chris@10 509 T1v = T1t * T1u;
Chris@10 510 rio[WS(vs, 1) + WS(rs, 1)] = FMA(T1g, T1r, T1f);
Chris@10 511 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1g, T1e, T1s);
Chris@10 512 iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1w, T1u, T1y);
Chris@10 513 }
Chris@10 514 }
Chris@10 515 }
Chris@10 516 }
Chris@10 517 }
Chris@10 518 }
Chris@10 519 }
Chris@10 520 rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1w, T1x, T1v);
Chris@10 521 }
Chris@10 522 }
Chris@10 523 }
Chris@10 524
/*
 * Twiddle-instruction list referenced by `desc` below: a {TW_FULL, 0, 5}
 * entry followed by a {TW_NEXT, 1, 0} entry.
 * NOTE(review): presumably TW_FULL asks for the full twiddle set of a
 * radix-5 codelet and TW_NEXT closes the list; tw_instr and both tags are
 * declared outside this file -- confirm there.
 */
Chris@10 525 static const tw_instr twinstr[] = {
Chris@10 526 {TW_FULL, 0, 5},
Chris@10 527 {TW_NEXT, 1, 0}
Chris@10 528 };
Chris@10 529
/*
 * Codelet descriptor for the HAVE_FMA build: 5 (presumably the radix), the
 * name "q1_5", the twinstr table and &GENUS. {70, 40, 130, 0} repeats the
 * add / multiply / fused multiply-add counts quoted in the comment ahead of
 * q1_5. The meaning of the fourth count and of the three trailing zeros is
 * defined by ct_desc, which is not visible here.
 */
Chris@10 530 static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, {70, 40, 130, 0}, 0, 0, 0 };
Chris@10 531
/*
 * Registration entry point (HAVE_FMA build): passes q1_5 and its descriptor
 * to X(kdft_difsq_register) for planner p. X() is a name-wrapping macro
 * defined outside this file.
 */
Chris@10 532 void X(codelet_q1_5) (planner *p) {
Chris@10 533 X(kdft_difsq_register) (p, q1_5, &desc);
Chris@10 534 }
Chris@10 535 #else /* HAVE_FMA */
Chris@10 536
Chris@10 537 /* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include q.h */
Chris@10 538
Chris@10 539 /*
Chris@10 540 * This function contains 200 FP additions, 140 FP multiplications,
Chris@10 541 * (or, 130 additions, 70 multiplications, 70 fused multiply/add),
Chris@10 542 * 75 stack variables, 4 constants, and 100 memory accesses
Chris@10 543 */
Chris@10 544 #include "q.h"
Chris@10 545
/*
 * q1_5 (build without HAVE_FMA): twiddle "square" codelet emitted by
 * gen_twidsq with -dif -n 5 (see the "Generated by" comment above).
 *
 * Each loop iteration works in place on a 5x5 block of complex values held
 * as separate real (rio) and imaginary (iio) arrays and addressed as
 * [WS(vs, j) + WS(rs, k)], j, k = 0..4:
 *   1. all 25 real and 25 imaginary inputs are loaded, and the first stage
 *      of a 5-point butterfly is formed along k for every j;
 *   2. the plain sum of row j is stored at [WS(rs, j)] (no twiddle);
 *   3. the four remaining outputs of row j are combined with the twiddle
 *      pairs (W[0],W[1]), (W[2],W[3]), (W[4],W[5]), (W[6],W[7]) and stored
 *      at [WS(vs, n) + WS(rs, j)], n = 1..4, i.e. at the transposed slot.
 * No output is stored before every input has been loaded.
 *
 * rio, iio: real / imaginary data, read and overwritten.
 * W:        twiddle table; 8 values are used per iteration, starting at
 *           W + mb * 8.
 * rs, vs:   the two strides of the 5x5 block (only used through WS()).
 * mb, me:   the loop runs for m = mb .. me - 1.
 * ms:       element offset added to rio and iio after every iteration.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from the included headers and are not visible
 * here; FMA(a, b, c) / FNMS(a, b, c) are presumed to mean a*b + c and
 * c - a*b -- confirm against the FFTW headers.
 */
Chris@10 546 static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@10 547 {
/* Butterfly constants; the literals match 1/4, sin(pi/5), sin(2*pi/5) and
 * sqrt(5)/4, in that order. */
Chris@10 548 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 549 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@10 550 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 551 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 552 {
Chris@10 553 INT m;
/* W is offset by mb * 8 once, then advanced by 8 per iteration together
 * with rio and iio (by ms). */
Chris@10 554 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
Chris@10 555 E T1, Ta, TG, Tv, T8, Tb, Tp, Tj, TD, To, Tq, Tr, TN, TW, T1s;
Chris@10 556 E T1h, TU, TX, T1b, T15, T1p, T1a, T1c, T1d, T1z, T1I, T2e, T23, T1G, T1J;
Chris@10 557 E T1X, T1R, T2b, T1W, T1Y, T1Z, T3v, T3p, T3J, T3u, T3w, T3x, T37, T3g, T3M;
Chris@10 558 E T3B, T3e, T3h, T2l, T2u, T30, T2P, T2s, T2v, T2J, T2D, T2X, T2I, T2K, T2L;
/* Load phase: each braced group below reads the five rio or iio values of
 * one vs row and forms the first-stage sums and differences along rs. */
Chris@10 559 {
Chris@10 560 E T7, Tu, T4, Tt;
Chris@10 561 T1 = rio[0];
Chris@10 562 {
Chris@10 563 E T5, T6, T2, T3;
Chris@10 564 T5 = rio[WS(rs, 2)];
Chris@10 565 T6 = rio[WS(rs, 3)];
Chris@10 566 T7 = T5 + T6;
Chris@10 567 Tu = T5 - T6;
Chris@10 568 T2 = rio[WS(rs, 1)];
Chris@10 569 T3 = rio[WS(rs, 4)];
Chris@10 570 T4 = T2 + T3;
Chris@10 571 Tt = T2 - T3;
Chris@10 572 }
Chris@10 573 Ta = KP559016994 * (T4 - T7);
Chris@10 574 TG = FNMS(KP587785252, Tt, KP951056516 * Tu);
Chris@10 575 Tv = FMA(KP951056516, Tt, KP587785252 * Tu);
Chris@10 576 T8 = T4 + T7;
Chris@10 577 Tb = FNMS(KP250000000, T8, T1);
Chris@10 578 }
Chris@10 579 {
Chris@10 580 E Ti, Tn, Tf, Tm;
Chris@10 581 Tp = iio[0];
Chris@10 582 {
Chris@10 583 E Tg, Th, Td, Te;
Chris@10 584 Tg = iio[WS(rs, 2)];
Chris@10 585 Th = iio[WS(rs, 3)];
Chris@10 586 Ti = Tg - Th;
Chris@10 587 Tn = Tg + Th;
Chris@10 588 Td = iio[WS(rs, 1)];
Chris@10 589 Te = iio[WS(rs, 4)];
Chris@10 590 Tf = Td - Te;
Chris@10 591 Tm = Td + Te;
Chris@10 592 }
Chris@10 593 Tj = FMA(KP951056516, Tf, KP587785252 * Ti);
Chris@10 594 TD = FNMS(KP587785252, Tf, KP951056516 * Ti);
Chris@10 595 To = KP559016994 * (Tm - Tn);
Chris@10 596 Tq = Tm + Tn;
Chris@10 597 Tr = FNMS(KP250000000, Tq, Tp);
Chris@10 598 }
Chris@10 599 {
Chris@10 600 E TT, T1g, TQ, T1f;
Chris@10 601 TN = rio[WS(vs, 1)];
Chris@10 602 {
Chris@10 603 E TR, TS, TO, TP;
Chris@10 604 TR = rio[WS(vs, 1) + WS(rs, 2)];
Chris@10 605 TS = rio[WS(vs, 1) + WS(rs, 3)];
Chris@10 606 TT = TR + TS;
Chris@10 607 T1g = TR - TS;
Chris@10 608 TO = rio[WS(vs, 1) + WS(rs, 1)];
Chris@10 609 TP = rio[WS(vs, 1) + WS(rs, 4)];
Chris@10 610 TQ = TO + TP;
Chris@10 611 T1f = TO - TP;
Chris@10 612 }
Chris@10 613 TW = KP559016994 * (TQ - TT);
Chris@10 614 T1s = FNMS(KP587785252, T1f, KP951056516 * T1g);
Chris@10 615 T1h = FMA(KP951056516, T1f, KP587785252 * T1g);
Chris@10 616 TU = TQ + TT;
Chris@10 617 TX = FNMS(KP250000000, TU, TN);
Chris@10 618 }
Chris@10 619 {
Chris@10 620 E T14, T19, T11, T18;
Chris@10 621 T1b = iio[WS(vs, 1)];
Chris@10 622 {
Chris@10 623 E T12, T13, TZ, T10;
Chris@10 624 T12 = iio[WS(vs, 1) + WS(rs, 2)];
Chris@10 625 T13 = iio[WS(vs, 1) + WS(rs, 3)];
Chris@10 626 T14 = T12 - T13;
Chris@10 627 T19 = T12 + T13;
Chris@10 628 TZ = iio[WS(vs, 1) + WS(rs, 1)];
Chris@10 629 T10 = iio[WS(vs, 1) + WS(rs, 4)];
Chris@10 630 T11 = TZ - T10;
Chris@10 631 T18 = TZ + T10;
Chris@10 632 }
Chris@10 633 T15 = FMA(KP951056516, T11, KP587785252 * T14);
Chris@10 634 T1p = FNMS(KP587785252, T11, KP951056516 * T14);
Chris@10 635 T1a = KP559016994 * (T18 - T19);
Chris@10 636 T1c = T18 + T19;
Chris@10 637 T1d = FNMS(KP250000000, T1c, T1b);
Chris@10 638 }
Chris@10 639 {
Chris@10 640 E T1F, T22, T1C, T21;
Chris@10 641 T1z = rio[WS(vs, 2)];
Chris@10 642 {
Chris@10 643 E T1D, T1E, T1A, T1B;
Chris@10 644 T1D = rio[WS(vs, 2) + WS(rs, 2)];
Chris@10 645 T1E = rio[WS(vs, 2) + WS(rs, 3)];
Chris@10 646 T1F = T1D + T1E;
Chris@10 647 T22 = T1D - T1E;
Chris@10 648 T1A = rio[WS(vs, 2) + WS(rs, 1)];
Chris@10 649 T1B = rio[WS(vs, 2) + WS(rs, 4)];
Chris@10 650 T1C = T1A + T1B;
Chris@10 651 T21 = T1A - T1B;
Chris@10 652 }
Chris@10 653 T1I = KP559016994 * (T1C - T1F);
Chris@10 654 T2e = FNMS(KP587785252, T21, KP951056516 * T22);
Chris@10 655 T23 = FMA(KP951056516, T21, KP587785252 * T22);
Chris@10 656 T1G = T1C + T1F;
Chris@10 657 T1J = FNMS(KP250000000, T1G, T1z);
Chris@10 658 }
Chris@10 659 {
Chris@10 660 E T1Q, T1V, T1N, T1U;
Chris@10 661 T1X = iio[WS(vs, 2)];
Chris@10 662 {
Chris@10 663 E T1O, T1P, T1L, T1M;
Chris@10 664 T1O = iio[WS(vs, 2) + WS(rs, 2)];
Chris@10 665 T1P = iio[WS(vs, 2) + WS(rs, 3)];
Chris@10 666 T1Q = T1O - T1P;
Chris@10 667 T1V = T1O + T1P;
Chris@10 668 T1L = iio[WS(vs, 2) + WS(rs, 1)];
Chris@10 669 T1M = iio[WS(vs, 2) + WS(rs, 4)];
Chris@10 670 T1N = T1L - T1M;
Chris@10 671 T1U = T1L + T1M;
Chris@10 672 }
Chris@10 673 T1R = FMA(KP951056516, T1N, KP587785252 * T1Q);
Chris@10 674 T2b = FNMS(KP587785252, T1N, KP951056516 * T1Q);
Chris@10 675 T1W = KP559016994 * (T1U - T1V);
Chris@10 676 T1Y = T1U + T1V;
Chris@10 677 T1Z = FNMS(KP250000000, T1Y, T1X);
Chris@10 678 }
Chris@10 679 {
Chris@10 680 E T3o, T3t, T3l, T3s;
Chris@10 681 T3v = iio[WS(vs, 4)];
Chris@10 682 {
Chris@10 683 E T3m, T3n, T3j, T3k;
Chris@10 684 T3m = iio[WS(vs, 4) + WS(rs, 2)];
Chris@10 685 T3n = iio[WS(vs, 4) + WS(rs, 3)];
Chris@10 686 T3o = T3m - T3n;
Chris@10 687 T3t = T3m + T3n;
Chris@10 688 T3j = iio[WS(vs, 4) + WS(rs, 1)];
Chris@10 689 T3k = iio[WS(vs, 4) + WS(rs, 4)];
Chris@10 690 T3l = T3j - T3k;
Chris@10 691 T3s = T3j + T3k;
Chris@10 692 }
Chris@10 693 T3p = FMA(KP951056516, T3l, KP587785252 * T3o);
Chris@10 694 T3J = FNMS(KP587785252, T3l, KP951056516 * T3o);
Chris@10 695 T3u = KP559016994 * (T3s - T3t);
Chris@10 696 T3w = T3s + T3t;
Chris@10 697 T3x = FNMS(KP250000000, T3w, T3v);
Chris@10 698 }
Chris@10 699 {
Chris@10 700 E T3d, T3A, T3a, T3z;
Chris@10 701 T37 = rio[WS(vs, 4)];
Chris@10 702 {
Chris@10 703 E T3b, T3c, T38, T39;
Chris@10 704 T3b = rio[WS(vs, 4) + WS(rs, 2)];
Chris@10 705 T3c = rio[WS(vs, 4) + WS(rs, 3)];
Chris@10 706 T3d = T3b + T3c;
Chris@10 707 T3A = T3b - T3c;
Chris@10 708 T38 = rio[WS(vs, 4) + WS(rs, 1)];
Chris@10 709 T39 = rio[WS(vs, 4) + WS(rs, 4)];
Chris@10 710 T3a = T38 + T39;
Chris@10 711 T3z = T38 - T39;
Chris@10 712 }
Chris@10 713 T3g = KP559016994 * (T3a - T3d);
Chris@10 714 T3M = FNMS(KP587785252, T3z, KP951056516 * T3A);
Chris@10 715 T3B = FMA(KP951056516, T3z, KP587785252 * T3A);
Chris@10 716 T3e = T3a + T3d;
Chris@10 717 T3h = FNMS(KP250000000, T3e, T37);
Chris@10 718 }
Chris@10 719 {
Chris@10 720 E T2r, T2O, T2o, T2N;
Chris@10 721 T2l = rio[WS(vs, 3)];
Chris@10 722 {
Chris@10 723 E T2p, T2q, T2m, T2n;
Chris@10 724 T2p = rio[WS(vs, 3) + WS(rs, 2)];
Chris@10 725 T2q = rio[WS(vs, 3) + WS(rs, 3)];
Chris@10 726 T2r = T2p + T2q;
Chris@10 727 T2O = T2p - T2q;
Chris@10 728 T2m = rio[WS(vs, 3) + WS(rs, 1)];
Chris@10 729 T2n = rio[WS(vs, 3) + WS(rs, 4)];
Chris@10 730 T2o = T2m + T2n;
Chris@10 731 T2N = T2m - T2n;
Chris@10 732 }
Chris@10 733 T2u = KP559016994 * (T2o - T2r);
Chris@10 734 T30 = FNMS(KP587785252, T2N, KP951056516 * T2O);
Chris@10 735 T2P = FMA(KP951056516, T2N, KP587785252 * T2O);
Chris@10 736 T2s = T2o + T2r;
Chris@10 737 T2v = FNMS(KP250000000, T2s, T2l);
Chris@10 738 }
Chris@10 739 {
Chris@10 740 E T2C, T2H, T2z, T2G;
Chris@10 741 T2J = iio[WS(vs, 3)];
Chris@10 742 {
Chris@10 743 E T2A, T2B, T2x, T2y;
Chris@10 744 T2A = iio[WS(vs, 3) + WS(rs, 2)];
Chris@10 745 T2B = iio[WS(vs, 3) + WS(rs, 3)];
Chris@10 746 T2C = T2A - T2B;
Chris@10 747 T2H = T2A + T2B;
Chris@10 748 T2x = iio[WS(vs, 3) + WS(rs, 1)];
Chris@10 749 T2y = iio[WS(vs, 3) + WS(rs, 4)];
Chris@10 750 T2z = T2x - T2y;
Chris@10 751 T2G = T2x + T2y;
Chris@10 752 }
Chris@10 753 T2D = FMA(KP951056516, T2z, KP587785252 * T2C);
Chris@10 754 T2X = FNMS(KP587785252, T2z, KP951056516 * T2C);
Chris@10 755 T2I = KP559016994 * (T2G - T2H);
Chris@10 756 T2K = T2G + T2H;
Chris@10 757 T2L = FNMS(KP250000000, T2K, T2J);
Chris@10 758 }
/* Store phase, part 1: the untwiddled sum of vs row j goes to [WS(rs, j)]. */
Chris@10 759 rio[0] = T1 + T8;
Chris@10 760 iio[0] = Tp + Tq;
Chris@10 761 rio[WS(rs, 1)] = TN + TU;
Chris@10 762 iio[WS(rs, 1)] = T1b + T1c;
Chris@10 763 rio[WS(rs, 2)] = T1z + T1G;
Chris@10 764 iio[WS(rs, 2)] = T1X + T1Y;
Chris@10 765 iio[WS(rs, 4)] = T3v + T3w;
Chris@10 766 rio[WS(rs, 4)] = T37 + T3e;
Chris@10 767 rio[WS(rs, 3)] = T2l + T2s;
Chris@10 768 iio[WS(rs, 3)] = T2J + T2K;
/* Store phase, part 2: each braced group finishes two outputs of one vs
 * row j and writes output n (1..4), combined with W[2n-2] and W[2n-1], to
 * [WS(vs, n) + WS(rs, j)]. */
Chris@10 769 {
Chris@10 770 E Tk, Ty, Tw, TA, Tc, Ts;
Chris@10 771 Tc = Ta + Tb;
Chris@10 772 Tk = Tc + Tj;
Chris@10 773 Ty = Tc - Tj;
Chris@10 774 Ts = To + Tr;
Chris@10 775 Tw = Ts - Tv;
Chris@10 776 TA = Tv + Ts;
Chris@10 777 {
Chris@10 778 E T9, Tl, Tx, Tz;
Chris@10 779 T9 = W[0];
Chris@10 780 Tl = W[1];
Chris@10 781 rio[WS(vs, 1)] = FMA(T9, Tk, Tl * Tw);
Chris@10 782 iio[WS(vs, 1)] = FNMS(Tl, Tk, T9 * Tw);
Chris@10 783 Tx = W[6];
Chris@10 784 Tz = W[7];
Chris@10 785 rio[WS(vs, 4)] = FMA(Tx, Ty, Tz * TA);
Chris@10 786 iio[WS(vs, 4)] = FNMS(Tz, Ty, Tx * TA);
Chris@10 787 }
Chris@10 788 }
Chris@10 789 {
Chris@10 790 E TE, TK, TI, TM, TC, TH;
Chris@10 791 TC = Tb - Ta;
Chris@10 792 TE = TC - TD;
Chris@10 793 TK = TC + TD;
Chris@10 794 TH = Tr - To;
Chris@10 795 TI = TG + TH;
Chris@10 796 TM = TH - TG;
Chris@10 797 {
Chris@10 798 E TB, TF, TJ, TL;
Chris@10 799 TB = W[2];
Chris@10 800 TF = W[3];
Chris@10 801 rio[WS(vs, 2)] = FMA(TB, TE, TF * TI);
Chris@10 802 iio[WS(vs, 2)] = FNMS(TF, TE, TB * TI);
Chris@10 803 TJ = W[4];
Chris@10 804 TL = W[5];
Chris@10 805 rio[WS(vs, 3)] = FMA(TJ, TK, TL * TM);
Chris@10 806 iio[WS(vs, 3)] = FNMS(TL, TK, TJ * TM);
Chris@10 807 }
Chris@10 808 }
Chris@10 809 {
Chris@10 810 E T2c, T2i, T2g, T2k, T2a, T2f;
Chris@10 811 T2a = T1J - T1I;
Chris@10 812 T2c = T2a - T2b;
Chris@10 813 T2i = T2a + T2b;
Chris@10 814 T2f = T1Z - T1W;
Chris@10 815 T2g = T2e + T2f;
Chris@10 816 T2k = T2f - T2e;
Chris@10 817 {
Chris@10 818 E T29, T2d, T2h, T2j;
Chris@10 819 T29 = W[2];
Chris@10 820 T2d = W[3];
Chris@10 821 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T29, T2c, T2d * T2g);
Chris@10 822 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2d, T2c, T29 * T2g);
Chris@10 823 T2h = W[4];
Chris@10 824 T2j = W[5];
Chris@10 825 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2h, T2i, T2j * T2k);
Chris@10 826 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2j, T2i, T2h * T2k);
Chris@10 827 }
Chris@10 828 }
Chris@10 829 {
Chris@10 830 E T3K, T3Q, T3O, T3S, T3I, T3N;
Chris@10 831 T3I = T3h - T3g;
Chris@10 832 T3K = T3I - T3J;
Chris@10 833 T3Q = T3I + T3J;
Chris@10 834 T3N = T3x - T3u;
Chris@10 835 T3O = T3M + T3N;
Chris@10 836 T3S = T3N - T3M;
Chris@10 837 {
Chris@10 838 E T3H, T3L, T3P, T3R;
Chris@10 839 T3H = W[2];
Chris@10 840 T3L = W[3];
Chris@10 841 rio[WS(vs, 2) + WS(rs, 4)] = FMA(T3H, T3K, T3L * T3O);
Chris@10 842 iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T3L, T3K, T3H * T3O);
Chris@10 843 T3P = W[4];
Chris@10 844 T3R = W[5];
Chris@10 845 rio[WS(vs, 3) + WS(rs, 4)] = FMA(T3P, T3Q, T3R * T3S);
Chris@10 846 iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T3R, T3Q, T3P * T3S);
Chris@10 847 }
Chris@10 848 }
Chris@10 849 {
Chris@10 850 E T1S, T26, T24, T28, T1K, T20;
Chris@10 851 T1K = T1I + T1J;
Chris@10 852 T1S = T1K + T1R;
Chris@10 853 T26 = T1K - T1R;
Chris@10 854 T20 = T1W + T1Z;
Chris@10 855 T24 = T20 - T23;
Chris@10 856 T28 = T23 + T20;
Chris@10 857 {
Chris@10 858 E T1H, T1T, T25, T27;
Chris@10 859 T1H = W[0];
Chris@10 860 T1T = W[1];
Chris@10 861 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1H, T1S, T1T * T24);
Chris@10 862 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1T, T1S, T1H * T24);
Chris@10 863 T25 = W[6];
Chris@10 864 T27 = W[7];
Chris@10 865 rio[WS(vs, 4) + WS(rs, 2)] = FMA(T25, T26, T27 * T28);
Chris@10 866 iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T27, T26, T25 * T28);
Chris@10 867 }
Chris@10 868 }
Chris@10 869 {
Chris@10 870 E T2E, T2S, T2Q, T2U, T2w, T2M;
Chris@10 871 T2w = T2u + T2v;
Chris@10 872 T2E = T2w + T2D;
Chris@10 873 T2S = T2w - T2D;
Chris@10 874 T2M = T2I + T2L;
Chris@10 875 T2Q = T2M - T2P;
Chris@10 876 T2U = T2P + T2M;
Chris@10 877 {
Chris@10 878 E T2t, T2F, T2R, T2T;
Chris@10 879 T2t = W[0];
Chris@10 880 T2F = W[1];
Chris@10 881 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T2t, T2E, T2F * T2Q);
Chris@10 882 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T2F, T2E, T2t * T2Q);
Chris@10 883 T2R = W[6];
Chris@10 884 T2T = W[7];
Chris@10 885 rio[WS(vs, 4) + WS(rs, 3)] = FMA(T2R, T2S, T2T * T2U);
Chris@10 886 iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T2T, T2S, T2R * T2U);
Chris@10 887 }
Chris@10 888 }
Chris@10 889 {
Chris@10 890 E T2Y, T34, T32, T36, T2W, T31;
Chris@10 891 T2W = T2v - T2u;
Chris@10 892 T2Y = T2W - T2X;
Chris@10 893 T34 = T2W + T2X;
Chris@10 894 T31 = T2L - T2I;
Chris@10 895 T32 = T30 + T31;
Chris@10 896 T36 = T31 - T30;
Chris@10 897 {
Chris@10 898 E T2V, T2Z, T33, T35;
Chris@10 899 T2V = W[2];
Chris@10 900 T2Z = W[3];
Chris@10 901 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T2V, T2Y, T2Z * T32);
Chris@10 902 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T2Z, T2Y, T2V * T32);
Chris@10 903 T33 = W[4];
Chris@10 904 T35 = W[5];
Chris@10 905 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T33, T34, T35 * T36);
Chris@10 906 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T35, T34, T33 * T36);
Chris@10 907 }
Chris@10 908 }
Chris@10 909 {
Chris@10 910 E T3q, T3E, T3C, T3G, T3i, T3y;
Chris@10 911 T3i = T3g + T3h;
Chris@10 912 T3q = T3i + T3p;
Chris@10 913 T3E = T3i - T3p;
Chris@10 914 T3y = T3u + T3x;
Chris@10 915 T3C = T3y - T3B;
Chris@10 916 T3G = T3B + T3y;
Chris@10 917 {
Chris@10 918 E T3f, T3r, T3D, T3F;
Chris@10 919 T3f = W[0];
Chris@10 920 T3r = W[1];
Chris@10 921 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3f, T3q, T3r * T3C);
Chris@10 922 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3r, T3q, T3f * T3C);
Chris@10 923 T3D = W[6];
Chris@10 924 T3F = W[7];
Chris@10 925 rio[WS(vs, 4) + WS(rs, 4)] = FMA(T3D, T3E, T3F * T3G);
Chris@10 926 iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T3F, T3E, T3D * T3G);
Chris@10 927 }
Chris@10 928 }
Chris@10 929 {
Chris@10 930 E T1q, T1w, T1u, T1y, T1o, T1t;
Chris@10 931 T1o = TX - TW;
Chris@10 932 T1q = T1o - T1p;
Chris@10 933 T1w = T1o + T1p;
Chris@10 934 T1t = T1d - T1a;
Chris@10 935 T1u = T1s + T1t;
Chris@10 936 T1y = T1t - T1s;
Chris@10 937 {
Chris@10 938 E T1n, T1r, T1v, T1x;
Chris@10 939 T1n = W[2];
Chris@10 940 T1r = W[3];
Chris@10 941 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1n, T1q, T1r * T1u);
Chris@10 942 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1r, T1q, T1n * T1u);
Chris@10 943 T1v = W[4];
Chris@10 944 T1x = W[5];
Chris@10 945 rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1v, T1w, T1x * T1y);
Chris@10 946 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1x, T1w, T1v * T1y);
Chris@10 947 }
Chris@10 948 }
Chris@10 949 {
Chris@10 950 E T16, T1k, T1i, T1m, TY, T1e;
Chris@10 951 TY = TW + TX;
Chris@10 952 T16 = TY + T15;
Chris@10 953 T1k = TY - T15;
Chris@10 954 T1e = T1a + T1d;
Chris@10 955 T1i = T1e - T1h;
Chris@10 956 T1m = T1h + T1e;
Chris@10 957 {
Chris@10 958 E TV, T17, T1j, T1l;
Chris@10 959 TV = W[0];
Chris@10 960 T17 = W[1];
Chris@10 961 rio[WS(vs, 1) + WS(rs, 1)] = FMA(TV, T16, T17 * T1i);
Chris@10 962 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T17, T16, TV * T1i);
Chris@10 963 T1j = W[6];
Chris@10 964 T1l = W[7];
Chris@10 965 rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1j, T1k, T1l * T1m);
Chris@10 966 iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1l, T1k, T1j * T1m);
Chris@10 967 }
Chris@10 968 }
Chris@10 969 }
Chris@10 970 }
Chris@10 971 }
Chris@10 972
/*
 * Twiddle-instruction list referenced by `desc` below: a {TW_FULL, 0, 5}
 * entry followed by a {TW_NEXT, 1, 0} entry.
 * NOTE(review): presumably TW_FULL asks for the full twiddle set of a
 * radix-5 codelet and TW_NEXT closes the list; tw_instr and both tags are
 * declared outside this file -- confirm there.
 */
Chris@10 973 static const tw_instr twinstr[] = {
Chris@10 974 {TW_FULL, 0, 5},
Chris@10 975 {TW_NEXT, 1, 0}
Chris@10 976 };
Chris@10 977
/*
 * Codelet descriptor for the build without HAVE_FMA: 5 (presumably the
 * radix), the name "q1_5", the twinstr table and &GENUS. {130, 70, 70, 0}
 * repeats the add / multiply / fused multiply-add counts quoted in the
 * comment ahead of q1_5. The meaning of the fourth count and of the three
 * trailing zeros is defined by ct_desc, which is not visible here.
 */
Chris@10 978 static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, {130, 70, 70, 0}, 0, 0, 0 };
Chris@10 979
/*
 * Registration entry point (build without HAVE_FMA): passes q1_5 and its
 * descriptor to X(kdft_difsq_register) for planner p. X() is a
 * name-wrapping macro defined outside this file.
 */
Chris@10 980 void X(codelet_q1_5) (planner *p) {
Chris@10 981 X(kdft_difsq_register) (p, q1_5, &desc);
Chris@10 982 }
Chris@10 983 #endif /* HAVE_FMA */