annotate src/fftw-3.3.3/dft/scalar/codelets/n1_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:35:46 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include n.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 208 FP additions, 72 FP multiplications,
Chris@10 32 * (or, 136 additions, 0 multiplications, 72 fused multiply/add),
Chris@10 33 * 86 stack variables, 4 constants, and 80 memory accesses
Chris@10 34 */
Chris@10 35 #include "n.h"
Chris@10 36
/*
 * n1_20 (HAVE_FMA variant): machine-generated straight-line kernel that is
 * registered further down as the size-20 "n1_20" kdft codelet.
 *
 * Each loop pass reads the 20 inputs ri[WS(is, k)] and ii[WS(is, k)] for
 * k = 0..19 and stores 20 results into ro[WS(os, k)] and io[WS(os, k)] for
 * k = 0..19.  Within one pass every load is issued before the first store.
 *
 *   ri, ii   input arrays (presumably real / imaginary parts -- confirm
 *            against codelet-dft.h)
 *   ro, io   output arrays (presumably real / imaginary parts -- confirm)
 *   is, os   strides handed to WS() for input / output indexing
 *   v        number of loop passes
 *   ivs, ovs amounts added to the input / output pointers after each pass
 *
 * The statement order comes from the generator's scheduler; do not reorder.
 */
Chris@10 37 static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 38 {
/* Constants used below; numerically sin(2*pi/5), sqrt(5)/4, (sqrt(5)-1)/2 and 1/4. */
Chris@10 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 41 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@10 42 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 43 {
Chris@10 44 INT i;
/* Counts i down from v; all four pointers advance once per pass.
   MAKE_VOLATILE_STRIDE is defined elsewhere -- its effect is not visible here. */
Chris@10 45 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
/* These four temporaries are assigned inside the nested block below and
   consumed by the final output group (indices 19, 11, 7, 3 of ro). */
Chris@10 46 E T1Y, T1Z, T1W, T1V;
Chris@10 47 {
Chris@10 48 E T1d, TP, TD, T7, T3b, T2N, T2f, T1R, T2U, TB, T2P, T2A, T3d, T37, T3j;
Chris@10 49 E TJ, T2n, T1b, T1T, T1y, T2b, T2h, T1j, T2V, Tm, T2O, T2H, T3c, T34, T1e;
Chris@10 50 E T1f, T3i, TG, T2m, T10, T1S, T1J, T28, T2g;
Chris@10 51 {
Chris@10 52 E T4, T1N, T3, T2L, TN, T5, T1O, T1P, T1h, T1i;
Chris@10 53 {
/* Loads of input indices 0, 10, 5, 15. */
Chris@10 54 E T1, T2, TL, TM;
Chris@10 55 T1 = ri[0];
Chris@10 56 T2 = ri[WS(is, 10)];
Chris@10 57 TL = ii[0];
Chris@10 58 TM = ii[WS(is, 10)];
Chris@10 59 T4 = ri[WS(is, 5)];
Chris@10 60 T1N = T1 - T2;
Chris@10 61 T3 = T1 + T2;
Chris@10 62 T2L = TL + TM;
Chris@10 63 TN = TL - TM;
Chris@10 64 T5 = ri[WS(is, 15)];
Chris@10 65 T1O = ii[WS(is, 5)];
Chris@10 66 T1P = ii[WS(is, 15)];
Chris@10 67 }
Chris@10 68 {
/* Loads of input indices 8, 18, 13, 3, 12, 2, 17, 7 and their sums/differences. */
Chris@10 69 E T1o, Tp, T2u, T13, T14, Ts, T2v, T1r, Tx, T1t, Tw, T2x, T18, Ty, T1u;
Chris@10 70 E T1v;
Chris@10 71 {
Chris@10 72 E Tq, Tr, T1p, T1q;
Chris@10 73 {
Chris@10 74 E Tn, To, T11, T12;
Chris@10 75 Tn = ri[WS(is, 8)];
Chris@10 76 {
Chris@10 77 E TO, T6, T2M, T1Q;
Chris@10 78 TO = T4 - T5;
Chris@10 79 T6 = T4 + T5;
Chris@10 80 T2M = T1O + T1P;
Chris@10 81 T1Q = T1O - T1P;
Chris@10 82 T1d = TO + TN;
Chris@10 83 TP = TN - TO;
Chris@10 84 TD = T3 + T6;
Chris@10 85 T7 = T3 - T6;
Chris@10 86 T3b = T2L + T2M;
Chris@10 87 T2N = T2L - T2M;
Chris@10 88 T2f = T1N + T1Q;
Chris@10 89 T1R = T1N - T1Q;
Chris@10 90 To = ri[WS(is, 18)];
Chris@10 91 }
Chris@10 92 T11 = ii[WS(is, 8)];
Chris@10 93 T12 = ii[WS(is, 18)];
Chris@10 94 Tq = ri[WS(is, 13)];
Chris@10 95 T1o = Tn - To;
Chris@10 96 Tp = Tn + To;
Chris@10 97 T2u = T11 + T12;
Chris@10 98 T13 = T11 - T12;
Chris@10 99 Tr = ri[WS(is, 3)];
Chris@10 100 T1p = ii[WS(is, 13)];
Chris@10 101 T1q = ii[WS(is, 3)];
Chris@10 102 }
Chris@10 103 {
Chris@10 104 E Tu, Tv, T16, T17;
Chris@10 105 Tu = ri[WS(is, 12)];
Chris@10 106 T14 = Tq - Tr;
Chris@10 107 Ts = Tq + Tr;
Chris@10 108 T2v = T1p + T1q;
Chris@10 109 T1r = T1p - T1q;
Chris@10 110 Tv = ri[WS(is, 2)];
Chris@10 111 T16 = ii[WS(is, 12)];
Chris@10 112 T17 = ii[WS(is, 2)];
Chris@10 113 Tx = ri[WS(is, 17)];
Chris@10 114 T1t = Tu - Tv;
Chris@10 115 Tw = Tu + Tv;
Chris@10 116 T2x = T16 + T17;
Chris@10 117 T18 = T16 - T17;
Chris@10 118 Ty = ri[WS(is, 7)];
Chris@10 119 T1u = ii[WS(is, 17)];
Chris@10 120 T1v = ii[WS(is, 7)];
Chris@10 121 }
Chris@10 122 }
Chris@10 123 {
Chris@10 124 E TH, T19, T1w, TI;
Chris@10 125 {
Chris@10 126 E Tt, T2w, T35, TA, T2z, T36, Tz, T2y;
Chris@10 127 TH = Tp + Ts;
Chris@10 128 Tt = Tp - Ts;
Chris@10 129 T19 = Tx - Ty;
Chris@10 130 Tz = Tx + Ty;
Chris@10 131 T2y = T1u + T1v;
Chris@10 132 T1w = T1u - T1v;
Chris@10 133 T2w = T2u - T2v;
Chris@10 134 T35 = T2u + T2v;
Chris@10 135 TI = Tw + Tz;
Chris@10 136 TA = Tw - Tz;
Chris@10 137 T2z = T2x - T2y;
Chris@10 138 T36 = T2x + T2y;
Chris@10 139 T2U = Tt - TA;
Chris@10 140 TB = Tt + TA;
Chris@10 141 T2P = T2w + T2z;
Chris@10 142 T2A = T2w - T2z;
Chris@10 143 T3d = T35 + T36;
Chris@10 144 T37 = T35 - T36;
Chris@10 145 }
Chris@10 146 {
Chris@10 147 E T1s, T29, T1x, T2a, T15, T1a;
Chris@10 148 T15 = T13 - T14;
Chris@10 149 T1h = T14 + T13;
Chris@10 150 T1i = T19 + T18;
Chris@10 151 T1a = T18 - T19;
Chris@10 152 T1s = T1o - T1r;
Chris@10 153 T29 = T1o + T1r;
Chris@10 154 T3j = TH - TI;
Chris@10 155 TJ = TH + TI;
Chris@10 156 T1x = T1t - T1w;
Chris@10 157 T2a = T1t + T1w;
Chris@10 158 T2n = T15 - T1a;
Chris@10 159 T1b = T15 + T1a;
Chris@10 160 T1T = T1s + T1x;
Chris@10 161 T1y = T1s - T1x;
Chris@10 162 T2b = T29 - T2a;
Chris@10 163 T2h = T29 + T2a;
Chris@10 164 }
Chris@10 165 }
Chris@10 166 }
Chris@10 167 {
/* Loads of input indices 4, 14, 9, 19, 16, 6, 1, 11 and their sums/differences. */
Chris@10 168 E Ta, T1z, T2B, TS, TT, Td, T2C, T1C, Ti, T1E, Th, T2E, TX, Tj, T1F;
Chris@10 169 E T1G;
Chris@10 170 {
Chris@10 171 E Tb, Tc, T1A, T1B;
Chris@10 172 {
Chris@10 173 E TQ, TR, T8, T9;
Chris@10 174 T8 = ri[WS(is, 4)];
Chris@10 175 T9 = ri[WS(is, 14)];
Chris@10 176 T1j = T1h + T1i;
Chris@10 177 T1Y = T1h - T1i;
Chris@10 178 TQ = ii[WS(is, 4)];
Chris@10 179 TR = ii[WS(is, 14)];
Chris@10 180 Ta = T8 + T9;
Chris@10 181 T1z = T8 - T9;
Chris@10 182 Tb = ri[WS(is, 9)];
Chris@10 183 T2B = TQ + TR;
Chris@10 184 TS = TQ - TR;
Chris@10 185 Tc = ri[WS(is, 19)];
Chris@10 186 T1A = ii[WS(is, 9)];
Chris@10 187 T1B = ii[WS(is, 19)];
Chris@10 188 }
Chris@10 189 {
Chris@10 190 E Tf, Tg, TV, TW;
Chris@10 191 Tf = ri[WS(is, 16)];
Chris@10 192 TT = Tb - Tc;
Chris@10 193 Td = Tb + Tc;
Chris@10 194 T2C = T1A + T1B;
Chris@10 195 T1C = T1A - T1B;
Chris@10 196 Tg = ri[WS(is, 6)];
Chris@10 197 TV = ii[WS(is, 16)];
Chris@10 198 TW = ii[WS(is, 6)];
Chris@10 199 Ti = ri[WS(is, 1)];
Chris@10 200 T1E = Tf - Tg;
Chris@10 201 Th = Tf + Tg;
Chris@10 202 T2E = TV + TW;
Chris@10 203 TX = TV - TW;
Chris@10 204 Tj = ri[WS(is, 11)];
Chris@10 205 T1F = ii[WS(is, 1)];
Chris@10 206 T1G = ii[WS(is, 11)];
Chris@10 207 }
Chris@10 208 }
Chris@10 209 {
Chris@10 210 E TE, TY, T1H, TF;
Chris@10 211 {
Chris@10 212 E Te, T2D, T32, Tl, T2G, T33, Tk, T2F;
Chris@10 213 TE = Ta + Td;
Chris@10 214 Te = Ta - Td;
Chris@10 215 TY = Ti - Tj;
Chris@10 216 Tk = Ti + Tj;
Chris@10 217 T2F = T1F + T1G;
Chris@10 218 T1H = T1F - T1G;
Chris@10 219 T2D = T2B - T2C;
Chris@10 220 T32 = T2B + T2C;
Chris@10 221 TF = Th + Tk;
Chris@10 222 Tl = Th - Tk;
Chris@10 223 T2G = T2E - T2F;
Chris@10 224 T33 = T2E + T2F;
Chris@10 225 T2V = Te - Tl;
Chris@10 226 Tm = Te + Tl;
Chris@10 227 T2O = T2D + T2G;
Chris@10 228 T2H = T2D - T2G;
Chris@10 229 T3c = T32 + T33;
Chris@10 230 T34 = T32 - T33;
Chris@10 231 }
Chris@10 232 {
Chris@10 233 E T1D, T26, T1I, T27, TU, TZ;
Chris@10 234 TU = TS - TT;
Chris@10 235 T1e = TT + TS;
Chris@10 236 T1f = TY + TX;
Chris@10 237 TZ = TX - TY;
Chris@10 238 T1D = T1z - T1C;
Chris@10 239 T26 = T1z + T1C;
Chris@10 240 T3i = TE - TF;
Chris@10 241 TG = TE + TF;
Chris@10 242 T1I = T1E - T1H;
Chris@10 243 T27 = T1E + T1H;
Chris@10 244 T2m = TU - TZ;
Chris@10 245 T10 = TU + TZ;
Chris@10 246 T1S = T1D + T1I;
Chris@10 247 T1J = T1D - T1I;
Chris@10 248 T28 = T26 - T27;
Chris@10 249 T2g = T26 + T27;
Chris@10 250 }
Chris@10 251 }
Chris@10 252 }
Chris@10 253 }
Chris@10 254 {
/* From here on no more inputs are read; the remaining code combines the
   temporaries and stores the outputs.  Stores for one group are interleaved
   with the start of the next group's arithmetic. */
Chris@10 255 E T1g, T3g, T3f, T2S, T2R, T2k, T2j;
Chris@10 256 {
/* Stores ro[10], ro[18], ro[2], ro[6], ro[14], io[10] (indices via WS(os, .)). */
Chris@10 257 E T2s, T2r, TC, T2Q;
Chris@10 258 T2s = Tm - TB;
Chris@10 259 TC = Tm + TB;
Chris@10 260 T1g = T1e + T1f;
Chris@10 261 T1Z = T1e - T1f;
Chris@10 262 T2r = FNMS(KP250000000, TC, T7);
Chris@10 263 ro[WS(os, 10)] = T7 + TC;
Chris@10 264 T2Q = T2O + T2P;
Chris@10 265 T2S = T2O - T2P;
Chris@10 266 {
Chris@10 267 E T2K, T2I, T2t, T2J;
Chris@10 268 T2K = FMA(KP618033988, T2A, T2H);
Chris@10 269 T2I = FNMS(KP618033988, T2H, T2A);
Chris@10 270 T2t = FNMS(KP559016994, T2s, T2r);
Chris@10 271 T2J = FMA(KP559016994, T2s, T2r);
Chris@10 272 ro[WS(os, 18)] = FMA(KP951056516, T2I, T2t);
Chris@10 273 ro[WS(os, 2)] = FNMS(KP951056516, T2I, T2t);
Chris@10 274 ro[WS(os, 6)] = FMA(KP951056516, T2K, T2J);
Chris@10 275 ro[WS(os, 14)] = FNMS(KP951056516, T2K, T2J);
Chris@10 276 T2R = FNMS(KP250000000, T2Q, T2N);
Chris@10 277 }
Chris@10 278 io[WS(os, 10)] = T2N + T2Q;
Chris@10 279 }
Chris@10 280 {
/* Stores io[14], io[6], io[18], io[2], ro[0], ro[8], ro[12], ro[16], ro[4], io[0]. */
Chris@10 281 E T30, T2Z, TK, T3e;
Chris@10 282 TK = TG + TJ;
Chris@10 283 T30 = TG - TJ;
Chris@10 284 {
Chris@10 285 E T2T, T2X, T2Y, T2W;
Chris@10 286 T2T = FNMS(KP559016994, T2S, T2R);
Chris@10 287 T2X = FMA(KP559016994, T2S, T2R);
Chris@10 288 T2Y = FMA(KP618033988, T2U, T2V);
Chris@10 289 T2W = FNMS(KP618033988, T2V, T2U);
Chris@10 290 io[WS(os, 14)] = FMA(KP951056516, T2Y, T2X);
Chris@10 291 io[WS(os, 6)] = FNMS(KP951056516, T2Y, T2X);
Chris@10 292 io[WS(os, 18)] = FNMS(KP951056516, T2W, T2T);
Chris@10 293 io[WS(os, 2)] = FMA(KP951056516, T2W, T2T);
Chris@10 294 T2Z = FNMS(KP250000000, TK, TD);
Chris@10 295 }
Chris@10 296 ro[0] = TD + TK;
Chris@10 297 T3e = T3c + T3d;
Chris@10 298 T3g = T3c - T3d;
Chris@10 299 {
Chris@10 300 E T31, T39, T3a, T38;
Chris@10 301 T31 = FMA(KP559016994, T30, T2Z);
Chris@10 302 T39 = FNMS(KP559016994, T30, T2Z);
Chris@10 303 T3a = FNMS(KP618033988, T34, T37);
Chris@10 304 T38 = FMA(KP618033988, T37, T34);
Chris@10 305 ro[WS(os, 8)] = FMA(KP951056516, T3a, T39);
Chris@10 306 ro[WS(os, 12)] = FNMS(KP951056516, T3a, T39);
Chris@10 307 ro[WS(os, 16)] = FMA(KP951056516, T38, T31);
Chris@10 308 ro[WS(os, 4)] = FNMS(KP951056516, T38, T31);
Chris@10 309 T3f = FNMS(KP250000000, T3e, T3b);
Chris@10 310 }
Chris@10 311 io[0] = T3b + T3e;
Chris@10 312 }
Chris@10 313 {
/* Stores io[12], io[8], io[16], io[4], io[5], io[17], io[13], io[9], io[1], ro[5]. */
Chris@10 314 E T24, T23, T1c, T2i;
Chris@10 315 T1c = T10 + T1b;
Chris@10 316 T24 = T10 - T1b;
Chris@10 317 {
Chris@10 318 E T3h, T3l, T3m, T3k;
Chris@10 319 T3h = FMA(KP559016994, T3g, T3f);
Chris@10 320 T3l = FNMS(KP559016994, T3g, T3f);
Chris@10 321 T3m = FNMS(KP618033988, T3i, T3j);
Chris@10 322 T3k = FMA(KP618033988, T3j, T3i);
Chris@10 323 io[WS(os, 12)] = FMA(KP951056516, T3m, T3l);
Chris@10 324 io[WS(os, 8)] = FNMS(KP951056516, T3m, T3l);
Chris@10 325 io[WS(os, 16)] = FNMS(KP951056516, T3k, T3h);
Chris@10 326 io[WS(os, 4)] = FMA(KP951056516, T3k, T3h);
Chris@10 327 T23 = FNMS(KP250000000, T1c, TP);
Chris@10 328 }
Chris@10 329 io[WS(os, 5)] = TP + T1c;
Chris@10 330 T2i = T2g + T2h;
Chris@10 331 T2k = T2g - T2h;
Chris@10 332 {
Chris@10 333 E T25, T2d, T2e, T2c;
Chris@10 334 T25 = FMA(KP559016994, T24, T23);
Chris@10 335 T2d = FNMS(KP559016994, T24, T23);
Chris@10 336 T2e = FNMS(KP618033988, T28, T2b);
Chris@10 337 T2c = FMA(KP618033988, T2b, T28);
Chris@10 338 io[WS(os, 17)] = FMA(KP951056516, T2e, T2d);
Chris@10 339 io[WS(os, 13)] = FNMS(KP951056516, T2e, T2d);
Chris@10 340 io[WS(os, 9)] = FMA(KP951056516, T2c, T25);
Chris@10 341 io[WS(os, 1)] = FNMS(KP951056516, T2c, T25);
Chris@10 342 T2j = FNMS(KP250000000, T2i, T2f);
Chris@10 343 }
Chris@10 344 ro[WS(os, 5)] = T2f + T2i;
Chris@10 345 }
Chris@10 346 {
/* Stores ro[17], ro[13], ro[9], ro[1], io[15], io[19], io[11], io[7], io[3], ro[15]. */
Chris@10 347 E T1m, T1l, T1k, T1U;
Chris@10 348 T1k = T1g + T1j;
Chris@10 349 T1m = T1g - T1j;
Chris@10 350 {
Chris@10 351 E T2l, T2p, T2q, T2o;
Chris@10 352 T2l = FMA(KP559016994, T2k, T2j);
Chris@10 353 T2p = FNMS(KP559016994, T2k, T2j);
Chris@10 354 T2q = FNMS(KP618033988, T2m, T2n);
Chris@10 355 T2o = FMA(KP618033988, T2n, T2m);
Chris@10 356 ro[WS(os, 17)] = FNMS(KP951056516, T2q, T2p);
Chris@10 357 ro[WS(os, 13)] = FMA(KP951056516, T2q, T2p);
Chris@10 358 ro[WS(os, 9)] = FNMS(KP951056516, T2o, T2l);
Chris@10 359 ro[WS(os, 1)] = FMA(KP951056516, T2o, T2l);
Chris@10 360 T1l = FNMS(KP250000000, T1k, T1d);
Chris@10 361 }
Chris@10 362 io[WS(os, 15)] = T1d + T1k;
Chris@10 363 T1U = T1S + T1T;
Chris@10 364 T1W = T1S - T1T;
Chris@10 365 {
Chris@10 366 E T1n, T1L, T1M, T1K;
Chris@10 367 T1n = FNMS(KP559016994, T1m, T1l);
Chris@10 368 T1L = FMA(KP559016994, T1m, T1l);
Chris@10 369 T1M = FMA(KP618033988, T1y, T1J);
Chris@10 370 T1K = FNMS(KP618033988, T1J, T1y);
Chris@10 371 io[WS(os, 19)] = FMA(KP951056516, T1M, T1L);
Chris@10 372 io[WS(os, 11)] = FNMS(KP951056516, T1M, T1L);
Chris@10 373 io[WS(os, 7)] = FMA(KP951056516, T1K, T1n);
Chris@10 374 io[WS(os, 3)] = FNMS(KP951056516, T1K, T1n);
Chris@10 375 T1V = FNMS(KP250000000, T1U, T1R);
Chris@10 376 }
Chris@10 377 ro[WS(os, 15)] = T1R + T1U;
Chris@10 378 }
Chris@10 379 }
Chris@10 380 }
Chris@10 381 {
/* Final group: stores ro[19], ro[11], ro[7], ro[3] from T1V, T1W, T1Y, T1Z. */
Chris@10 382 E T21, T1X, T20, T22;
Chris@10 383 T21 = FMA(KP559016994, T1W, T1V);
Chris@10 384 T1X = FNMS(KP559016994, T1W, T1V);
Chris@10 385 T20 = FNMS(KP618033988, T1Z, T1Y);
Chris@10 386 T22 = FMA(KP618033988, T1Y, T1Z);
Chris@10 387 ro[WS(os, 19)] = FNMS(KP951056516, T22, T21);
Chris@10 388 ro[WS(os, 11)] = FMA(KP951056516, T22, T21);
Chris@10 389 ro[WS(os, 7)] = FNMS(KP951056516, T20, T1X);
Chris@10 390 ro[WS(os, 3)] = FMA(KP951056516, T20, T1X);
Chris@10 391 }
Chris@10 392 }
Chris@10 393 }
Chris@10 394 }
Chris@10 395
/*
 * Descriptor handed to the planner for the HAVE_FMA kernel: size 20, name
 * "n1_20".  The first three numbers in the braces match the operation counts
 * quoted in the generated header comment above (136 additions,
 * 0 multiplications, 72 fused multiply/add).  The meaning of the fourth
 * number, of GENUS and of the trailing zero fields is defined by kdft_desc
 * elsewhere and is not visible here.
 */
Chris@10 396 static const kdft_desc desc = { 20, "n1_20", {136, 0, 72, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 397
/*
 * Registration entry point (HAVE_FMA build): passes the kernel n1_20 and its
 * descriptor desc to X(kdft_register) for planner p.  X() is defined
 * elsewhere (presumably a name-decorating macro -- confirm).
 */
Chris@10 398 void X(codelet_n1_20) (planner *p) {
Chris@10 399 X(kdft_register) (p, n1_20, &desc);
Chris@10 400 }
Chris@10 401
Chris@10 402 #else /* HAVE_FMA */
Chris@10 403
Chris@10 404 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include n.h */
Chris@10 405
Chris@10 406 /*
Chris@10 407 * This function contains 208 FP additions, 48 FP multiplications,
Chris@10 408 * (or, 184 additions, 24 multiplications, 24 fused multiply/add),
Chris@10 409 * 81 stack variables, 4 constants, and 80 memory accesses
Chris@10 410 */
Chris@10 411 #include "n.h"
Chris@10 412
/*
 * n1_20 (variant built when HAVE_FMA is not defined): machine-generated
 * straight-line kernel that is registered further down as the size-20
 * "n1_20" kdft codelet.
 *
 * Each loop pass reads the 20 inputs ri[WS(is, k)] and ii[WS(is, k)] for
 * k = 0..19 and stores 20 results into ro[WS(os, k)] and io[WS(os, k)] for
 * k = 0..19.  Within one pass every load is issued before the first store.
 *
 *   ri, ii   input arrays (presumably real / imaginary parts -- confirm
 *            against codelet-dft.h)
 *   ro, io   output arrays (presumably real / imaginary parts -- confirm)
 *   is, os   strides handed to WS() for input / output indexing
 *   v        number of loop passes
 *   ivs, ovs amounts added to the input / output pointers after each pass
 *
 * The code is laid out as three load/combine blocks followed by eight output
 * blocks, each of which stores five values.
 */
Chris@10 413 static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 414 {
/* Constants used below; numerically sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
Chris@10 415 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@10 416 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 417 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 418 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 419 {
Chris@10 420 INT i;
/* Counts i down from v; all four pointers advance once per pass.
   MAKE_VOLATILE_STRIDE is defined elsewhere -- its effect is not visible here. */
Chris@10 421 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
/* Temporaries that carry values from the load blocks to the output blocks. */
Chris@10 422 E T7, T2Q, T3h, TD, TP, T1U, T2l, T1d, Tt, TA, TB, T2w, T2z, T2S, T35;
Chris@10 423 E T36, T3f, TH, TI, TJ, T15, T1a, T1b, T1s, T1x, T1W, T29, T2a, T2j, T1h;
Chris@10 424 E T1i, T1j, Te, Tl, Tm, T2D, T2G, T2R, T32, T33, T3e, TE, TF, TG, TU;
Chris@10 425 E TZ, T10, T1D, T1I, T1V, T26, T27, T2i, T1e, T1f, T1g;
Chris@10 426 {
/* Load block 1: input indices 0, 10, 5, 15. */
Chris@10 427 E T3, T1Q, TN, T2O, T6, TO, T1T, T2P;
Chris@10 428 {
Chris@10 429 E T1, T2, TL, TM;
Chris@10 430 T1 = ri[0];
Chris@10 431 T2 = ri[WS(is, 10)];
Chris@10 432 T3 = T1 + T2;
Chris@10 433 T1Q = T1 - T2;
Chris@10 434 TL = ii[0];
Chris@10 435 TM = ii[WS(is, 10)];
Chris@10 436 TN = TL - TM;
Chris@10 437 T2O = TL + TM;
Chris@10 438 }
Chris@10 439 {
Chris@10 440 E T4, T5, T1R, T1S;
Chris@10 441 T4 = ri[WS(is, 5)];
Chris@10 442 T5 = ri[WS(is, 15)];
Chris@10 443 T6 = T4 + T5;
Chris@10 444 TO = T4 - T5;
Chris@10 445 T1R = ii[WS(is, 5)];
Chris@10 446 T1S = ii[WS(is, 15)];
Chris@10 447 T1T = T1R - T1S;
Chris@10 448 T2P = T1R + T1S;
Chris@10 449 }
Chris@10 450 T7 = T3 - T6;
Chris@10 451 T2Q = T2O - T2P;
Chris@10 452 T3h = T2O + T2P;
Chris@10 453 TD = T3 + T6;
Chris@10 454 TP = TN - TO;
Chris@10 455 T1U = T1Q - T1T;
Chris@10 456 T2l = T1Q + T1T;
Chris@10 457 T1d = TO + TN;
Chris@10 458 }
Chris@10 459 {
/* Load block 2: input indices 8, 18, 13, 3, 12, 2, 17, 7. */
Chris@10 460 E Tp, T1o, T13, T2u, Ts, T14, T1r, T2v, Tw, T1t, T18, T2x, Tz, T19, T1w;
Chris@10 461 E T2y;
Chris@10 462 {
Chris@10 463 E Tn, To, T11, T12;
Chris@10 464 Tn = ri[WS(is, 8)];
Chris@10 465 To = ri[WS(is, 18)];
Chris@10 466 Tp = Tn + To;
Chris@10 467 T1o = Tn - To;
Chris@10 468 T11 = ii[WS(is, 8)];
Chris@10 469 T12 = ii[WS(is, 18)];
Chris@10 470 T13 = T11 - T12;
Chris@10 471 T2u = T11 + T12;
Chris@10 472 }
Chris@10 473 {
Chris@10 474 E Tq, Tr, T1p, T1q;
Chris@10 475 Tq = ri[WS(is, 13)];
Chris@10 476 Tr = ri[WS(is, 3)];
Chris@10 477 Ts = Tq + Tr;
Chris@10 478 T14 = Tq - Tr;
Chris@10 479 T1p = ii[WS(is, 13)];
Chris@10 480 T1q = ii[WS(is, 3)];
Chris@10 481 T1r = T1p - T1q;
Chris@10 482 T2v = T1p + T1q;
Chris@10 483 }
Chris@10 484 {
Chris@10 485 E Tu, Tv, T16, T17;
Chris@10 486 Tu = ri[WS(is, 12)];
Chris@10 487 Tv = ri[WS(is, 2)];
Chris@10 488 Tw = Tu + Tv;
Chris@10 489 T1t = Tu - Tv;
Chris@10 490 T16 = ii[WS(is, 12)];
Chris@10 491 T17 = ii[WS(is, 2)];
Chris@10 492 T18 = T16 - T17;
Chris@10 493 T2x = T16 + T17;
Chris@10 494 }
Chris@10 495 {
Chris@10 496 E Tx, Ty, T1u, T1v;
Chris@10 497 Tx = ri[WS(is, 17)];
Chris@10 498 Ty = ri[WS(is, 7)];
Chris@10 499 Tz = Tx + Ty;
Chris@10 500 T19 = Tx - Ty;
Chris@10 501 T1u = ii[WS(is, 17)];
Chris@10 502 T1v = ii[WS(is, 7)];
Chris@10 503 T1w = T1u - T1v;
Chris@10 504 T2y = T1u + T1v;
Chris@10 505 }
Chris@10 506 Tt = Tp - Ts;
Chris@10 507 TA = Tw - Tz;
Chris@10 508 TB = Tt + TA;
Chris@10 509 T2w = T2u - T2v;
Chris@10 510 T2z = T2x - T2y;
Chris@10 511 T2S = T2w + T2z;
Chris@10 512 T35 = T2u + T2v;
Chris@10 513 T36 = T2x + T2y;
Chris@10 514 T3f = T35 + T36;
Chris@10 515 TH = Tp + Ts;
Chris@10 516 TI = Tw + Tz;
Chris@10 517 TJ = TH + TI;
Chris@10 518 T15 = T13 - T14;
Chris@10 519 T1a = T18 - T19;
Chris@10 520 T1b = T15 + T1a;
Chris@10 521 T1s = T1o - T1r;
Chris@10 522 T1x = T1t - T1w;
Chris@10 523 T1W = T1s + T1x;
Chris@10 524 T29 = T1o + T1r;
Chris@10 525 T2a = T1t + T1w;
Chris@10 526 T2j = T29 + T2a;
Chris@10 527 T1h = T14 + T13;
Chris@10 528 T1i = T19 + T18;
Chris@10 529 T1j = T1h + T1i;
Chris@10 530 }
Chris@10 531 {
/* Load block 3: input indices 4, 14, 9, 19, 16, 6, 1, 11. */
Chris@10 532 E Ta, T1z, TS, T2B, Td, TT, T1C, T2C, Th, T1E, TX, T2E, Tk, TY, T1H;
Chris@10 533 E T2F;
Chris@10 534 {
Chris@10 535 E T8, T9, TQ, TR;
Chris@10 536 T8 = ri[WS(is, 4)];
Chris@10 537 T9 = ri[WS(is, 14)];
Chris@10 538 Ta = T8 + T9;
Chris@10 539 T1z = T8 - T9;
Chris@10 540 TQ = ii[WS(is, 4)];
Chris@10 541 TR = ii[WS(is, 14)];
Chris@10 542 TS = TQ - TR;
Chris@10 543 T2B = TQ + TR;
Chris@10 544 }
Chris@10 545 {
Chris@10 546 E Tb, Tc, T1A, T1B;
Chris@10 547 Tb = ri[WS(is, 9)];
Chris@10 548 Tc = ri[WS(is, 19)];
Chris@10 549 Td = Tb + Tc;
Chris@10 550 TT = Tb - Tc;
Chris@10 551 T1A = ii[WS(is, 9)];
Chris@10 552 T1B = ii[WS(is, 19)];
Chris@10 553 T1C = T1A - T1B;
Chris@10 554 T2C = T1A + T1B;
Chris@10 555 }
Chris@10 556 {
Chris@10 557 E Tf, Tg, TV, TW;
Chris@10 558 Tf = ri[WS(is, 16)];
Chris@10 559 Tg = ri[WS(is, 6)];
Chris@10 560 Th = Tf + Tg;
Chris@10 561 T1E = Tf - Tg;
Chris@10 562 TV = ii[WS(is, 16)];
Chris@10 563 TW = ii[WS(is, 6)];
Chris@10 564 TX = TV - TW;
Chris@10 565 T2E = TV + TW;
Chris@10 566 }
Chris@10 567 {
Chris@10 568 E Ti, Tj, T1F, T1G;
Chris@10 569 Ti = ri[WS(is, 1)];
Chris@10 570 Tj = ri[WS(is, 11)];
Chris@10 571 Tk = Ti + Tj;
Chris@10 572 TY = Ti - Tj;
Chris@10 573 T1F = ii[WS(is, 1)];
Chris@10 574 T1G = ii[WS(is, 11)];
Chris@10 575 T1H = T1F - T1G;
Chris@10 576 T2F = T1F + T1G;
Chris@10 577 }
Chris@10 578 Te = Ta - Td;
Chris@10 579 Tl = Th - Tk;
Chris@10 580 Tm = Te + Tl;
Chris@10 581 T2D = T2B - T2C;
Chris@10 582 T2G = T2E - T2F;
Chris@10 583 T2R = T2D + T2G;
Chris@10 584 T32 = T2B + T2C;
Chris@10 585 T33 = T2E + T2F;
Chris@10 586 T3e = T32 + T33;
Chris@10 587 TE = Ta + Td;
Chris@10 588 TF = Th + Tk;
Chris@10 589 TG = TE + TF;
Chris@10 590 TU = TS - TT;
Chris@10 591 TZ = TX - TY;
Chris@10 592 T10 = TU + TZ;
Chris@10 593 T1D = T1z - T1C;
Chris@10 594 T1I = T1E - T1H;
Chris@10 595 T1V = T1D + T1I;
Chris@10 596 T26 = T1z + T1C;
Chris@10 597 T27 = T1E + T1H;
Chris@10 598 T2i = T26 + T27;
Chris@10 599 T1e = TT + TS;
Chris@10 600 T1f = TY + TX;
Chris@10 601 T1g = T1e + T1f;
Chris@10 602 }
Chris@10 603 {
/* Output block: ro at indices 10, 14, 6, 2, 18. */
Chris@10 604 E T2s, TC, T2r, T2I, T2K, T2A, T2H, T2J, T2t;
Chris@10 605 T2s = KP559016994 * (Tm - TB);
Chris@10 606 TC = Tm + TB;
Chris@10 607 T2r = FNMS(KP250000000, TC, T7);
Chris@10 608 T2A = T2w - T2z;
Chris@10 609 T2H = T2D - T2G;
Chris@10 610 T2I = FNMS(KP587785252, T2H, KP951056516 * T2A);
Chris@10 611 T2K = FMA(KP951056516, T2H, KP587785252 * T2A);
Chris@10 612 ro[WS(os, 10)] = T7 + TC;
Chris@10 613 T2J = T2s + T2r;
Chris@10 614 ro[WS(os, 14)] = T2J - T2K;
Chris@10 615 ro[WS(os, 6)] = T2J + T2K;
Chris@10 616 T2t = T2r - T2s;
Chris@10 617 ro[WS(os, 2)] = T2t - T2I;
Chris@10 618 ro[WS(os, 18)] = T2t + T2I;
Chris@10 619 }
Chris@10 620 {
/* Output block: io at indices 10, 6, 14, 2, 18. */
Chris@10 621 E T2V, T2T, T2U, T2N, T2Y, T2L, T2M, T2X, T2W;
Chris@10 622 T2V = KP559016994 * (T2R - T2S);
Chris@10 623 T2T = T2R + T2S;
Chris@10 624 T2U = FNMS(KP250000000, T2T, T2Q);
Chris@10 625 T2L = Tt - TA;
Chris@10 626 T2M = Te - Tl;
Chris@10 627 T2N = FNMS(KP587785252, T2M, KP951056516 * T2L);
Chris@10 628 T2Y = FMA(KP951056516, T2M, KP587785252 * T2L);
Chris@10 629 io[WS(os, 10)] = T2Q + T2T;
Chris@10 630 T2X = T2V + T2U;
Chris@10 631 io[WS(os, 6)] = T2X - T2Y;
Chris@10 632 io[WS(os, 14)] = T2Y + T2X;
Chris@10 633 T2W = T2U - T2V;
Chris@10 634 io[WS(os, 2)] = T2N + T2W;
Chris@10 635 io[WS(os, 18)] = T2W - T2N;
Chris@10 636 }
Chris@10 637 {
/* Output block: ro at indices 0, 12, 8, 4, 16. */
Chris@10 638 E T2Z, TK, T30, T38, T3a, T34, T37, T39, T31;
Chris@10 639 T2Z = KP559016994 * (TG - TJ);
Chris@10 640 TK = TG + TJ;
Chris@10 641 T30 = FNMS(KP250000000, TK, TD);
Chris@10 642 T34 = T32 - T33;
Chris@10 643 T37 = T35 - T36;
Chris@10 644 T38 = FMA(KP951056516, T34, KP587785252 * T37);
Chris@10 645 T3a = FNMS(KP587785252, T34, KP951056516 * T37);
Chris@10 646 ro[0] = TD + TK;
Chris@10 647 T39 = T30 - T2Z;
Chris@10 648 ro[WS(os, 12)] = T39 - T3a;
Chris@10 649 ro[WS(os, 8)] = T39 + T3a;
Chris@10 650 T31 = T2Z + T30;
Chris@10 651 ro[WS(os, 4)] = T31 - T38;
Chris@10 652 ro[WS(os, 16)] = T31 + T38;
Chris@10 653 }
Chris@10 654 {
/* Output block: io at indices 0, 8, 12, 4, 16. */
Chris@10 655 E T3g, T3i, T3j, T3d, T3m, T3b, T3c, T3l, T3k;
Chris@10 656 T3g = KP559016994 * (T3e - T3f);
Chris@10 657 T3i = T3e + T3f;
Chris@10 658 T3j = FNMS(KP250000000, T3i, T3h);
Chris@10 659 T3b = TE - TF;
Chris@10 660 T3c = TH - TI;
Chris@10 661 T3d = FMA(KP951056516, T3b, KP587785252 * T3c);
Chris@10 662 T3m = FNMS(KP587785252, T3b, KP951056516 * T3c);
Chris@10 663 io[0] = T3h + T3i;
Chris@10 664 T3l = T3j - T3g;
Chris@10 665 io[WS(os, 8)] = T3l - T3m;
Chris@10 666 io[WS(os, 12)] = T3m + T3l;
Chris@10 667 T3k = T3g + T3j;
Chris@10 668 io[WS(os, 4)] = T3d + T3k;
Chris@10 669 io[WS(os, 16)] = T3k - T3d;
Chris@10 670 }
Chris@10 671 {
/* Output block: io at indices 5, 13, 17, 1, 9. */
Chris@10 672 E T23, T1c, T24, T2c, T2e, T28, T2b, T2d, T25;
Chris@10 673 T23 = KP559016994 * (T10 - T1b);
Chris@10 674 T1c = T10 + T1b;
Chris@10 675 T24 = FNMS(KP250000000, T1c, TP);
Chris@10 676 T28 = T26 - T27;
Chris@10 677 T2b = T29 - T2a;
Chris@10 678 T2c = FMA(KP951056516, T28, KP587785252 * T2b);
Chris@10 679 T2e = FNMS(KP587785252, T28, KP951056516 * T2b);
Chris@10 680 io[WS(os, 5)] = TP + T1c;
Chris@10 681 T2d = T24 - T23;
Chris@10 682 io[WS(os, 13)] = T2d - T2e;
Chris@10 683 io[WS(os, 17)] = T2d + T2e;
Chris@10 684 T25 = T23 + T24;
Chris@10 685 io[WS(os, 1)] = T25 - T2c;
Chris@10 686 io[WS(os, 9)] = T25 + T2c;
Chris@10 687 }
Chris@10 688 {
/* Output block: ro at indices 5, 13, 17, 1, 9. */
Chris@10 689 E T2k, T2m, T2n, T2h, T2p, T2f, T2g, T2q, T2o;
Chris@10 690 T2k = KP559016994 * (T2i - T2j);
Chris@10 691 T2m = T2i + T2j;
Chris@10 692 T2n = FNMS(KP250000000, T2m, T2l);
Chris@10 693 T2f = TU - TZ;
Chris@10 694 T2g = T15 - T1a;
Chris@10 695 T2h = FMA(KP951056516, T2f, KP587785252 * T2g);
Chris@10 696 T2p = FNMS(KP587785252, T2f, KP951056516 * T2g);
Chris@10 697 ro[WS(os, 5)] = T2l + T2m;
Chris@10 698 T2q = T2n - T2k;
Chris@10 699 ro[WS(os, 13)] = T2p + T2q;
Chris@10 700 ro[WS(os, 17)] = T2q - T2p;
Chris@10 701 T2o = T2k + T2n;
Chris@10 702 ro[WS(os, 1)] = T2h + T2o;
Chris@10 703 ro[WS(os, 9)] = T2o - T2h;
Chris@10 704 }
Chris@10 705 {
/* Output block: io at indices 15, 11, 19, 3, 7. */
Chris@10 706 E T1m, T1k, T1l, T1K, T1M, T1y, T1J, T1L, T1n;
Chris@10 707 T1m = KP559016994 * (T1g - T1j);
Chris@10 708 T1k = T1g + T1j;
Chris@10 709 T1l = FNMS(KP250000000, T1k, T1d);
Chris@10 710 T1y = T1s - T1x;
Chris@10 711 T1J = T1D - T1I;
Chris@10 712 T1K = FNMS(KP587785252, T1J, KP951056516 * T1y);
Chris@10 713 T1M = FMA(KP951056516, T1J, KP587785252 * T1y);
Chris@10 714 io[WS(os, 15)] = T1d + T1k;
Chris@10 715 T1L = T1m + T1l;
Chris@10 716 io[WS(os, 11)] = T1L - T1M;
Chris@10 717 io[WS(os, 19)] = T1L + T1M;
Chris@10 718 T1n = T1l - T1m;
Chris@10 719 io[WS(os, 3)] = T1n - T1K;
Chris@10 720 io[WS(os, 7)] = T1n + T1K;
Chris@10 721 }
Chris@10 722 {
/* Output block: ro at indices 15, 11, 19, 3, 7. */
Chris@10 723 E T1Z, T1X, T1Y, T1P, T21, T1N, T1O, T22, T20;
Chris@10 724 T1Z = KP559016994 * (T1V - T1W);
Chris@10 725 T1X = T1V + T1W;
Chris@10 726 T1Y = FNMS(KP250000000, T1X, T1U);
Chris@10 727 T1N = T1h - T1i;
Chris@10 728 T1O = T1e - T1f;
Chris@10 729 T1P = FNMS(KP587785252, T1O, KP951056516 * T1N);
Chris@10 730 T21 = FMA(KP951056516, T1O, KP587785252 * T1N);
Chris@10 731 ro[WS(os, 15)] = T1U + T1X;
Chris@10 732 T22 = T1Z + T1Y;
Chris@10 733 ro[WS(os, 11)] = T21 + T22;
Chris@10 734 ro[WS(os, 19)] = T22 - T21;
Chris@10 735 T20 = T1Y - T1Z;
Chris@10 736 ro[WS(os, 3)] = T1P + T20;
Chris@10 737 ro[WS(os, 7)] = T20 - T1P;
Chris@10 738 }
Chris@10 739 }
Chris@10 740 }
Chris@10 741 }
Chris@10 742
/*
 * Descriptor handed to the planner for the non-FMA kernel: size 20, name
 * "n1_20".  The first three numbers in the braces match the operation counts
 * quoted in the generated header comment above (184 additions,
 * 24 multiplications, 24 fused multiply/add).  The meaning of the fourth
 * number, of GENUS and of the trailing zero fields is defined by kdft_desc
 * elsewhere and is not visible here.
 */
Chris@10 743 static const kdft_desc desc = { 20, "n1_20", {184, 24, 24, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 744
/*
 * Registration entry point (non-FMA build): passes the kernel n1_20 and its
 * descriptor desc to X(kdft_register) for planner p.  X() is defined
 * elsewhere (presumably a name-decorating macro -- confirm).
 */
Chris@10 745 void X(codelet_n1_20) (planner *p) {
Chris@10 746 X(kdft_register) (p, n1_20, &desc);
Chris@10 747 }
Chris@10 748
Chris@10 749 #endif /* HAVE_FMA */