annotate src/fftw-3.3.3/dft/scalar/codelets/n1_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:35:43 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include n.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 156 FP additions, 84 FP multiplications,
Chris@10 32 * (or, 72 additions, 0 multiplications, 84 fused multiply/add),
Chris@10 33 * 75 stack variables, 6 constants, and 60 memory accesses
Chris@10 34 */
Chris@10 35 #include "n.h"
Chris@10 36
/*
 * n1_15 (HAVE_FMA variant): generated straight-line kernel for a transform
 * of size 15 (the size recorded in the kdft_desc that follows this function).
 *
 * ri, ii   - input arrays; element k is read as ri[WS(is, k)] and
 *            ii[WS(is, k)] for k = 0..14 (presumably the real and imaginary
 *            parts of the input -- confirm against codelet-dft.h).
 * ro, io   - output arrays; element k is written as ro[WS(os, k)] and
 *            io[WS(os, k)] for k = 0..14.
 * is, os   - input and output strides, applied through the WS() macro.
 * v        - number of transforms to perform; the loop body runs v times and
 *            not at all when v <= 0.
 * ivs, ovs - amounts added to the input / output pointers after each
 *            transform.
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are declared
 * outside this file.
 * NOTE(review): judging by the non-FMA variant of this function later in the
 * file, FMA(a, b, c) is a*b + c and FNMS(a, b, c) is c - a*b -- confirm in
 * the project headers.
 */
Chris@10 37 static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 38 {
/* Numeric constants: 0.951... is sin(2*pi/5), 0.559... is sqrt(5)/4,
 * 0.618... is (sqrt(5) - 1)/2 and 0.866... is sqrt(3)/2. */
Chris@10 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 41 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@10 42 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 43 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 44 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 45 {
Chris@10 46 INT i;
/* One transform per iteration; the four data pointers advance by ivs / ovs
 * in the loop's increment expression. */
Chris@10 47 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
/* Declared at loop scope because they are assigned inside the nested block
 * below and consumed by the final output block of the iteration. */
Chris@10 48 E T1r, T1g, T14, T13;
Chris@10 49 {
Chris@10 50 E T5, T2l, Tx, TV, T1z, T1X, T2s, Tr, T24, TT, T2e, T2n, T1Z, T1Q, T1B;
Chris@10 51 E T11, T1H, TW, T2t, Tg, TX, T25, TI, T2h, T2m, T1Y, T1T, T1A;
Chris@10 52 {
Chris@10 53 E T1, T1v, T2, T3, Tu, Tv, TZ, T10;
/* Load inputs 0, 5 and 10. */
Chris@10 54 T1 = ri[0];
Chris@10 55 T1v = ii[0];
Chris@10 56 T2 = ri[WS(is, 5)];
Chris@10 57 T3 = ri[WS(is, 10)];
Chris@10 58 Tu = ii[WS(is, 5)];
Chris@10 59 Tv = ii[WS(is, 10)];
Chris@10 60 {
Chris@10 61 E T1k, Tm, TM, TJ, Tl, T2c, T1j, T1m, TP, T1p, Tp, TQ;
Chris@10 62 {
Chris@10 63 E Th, T1h, TK, TL, Tk, Tn, To, T1i;
Chris@10 64 {
Chris@10 65 E Ti, Tj, T1y, T4;
/* Inputs 6, 11 and 1 are loaded here, interleaved with the arithmetic
 * that combines inputs 0, 5 and 10. */
Chris@10 66 Th = ri[WS(is, 6)];
Chris@10 67 T1y = T3 - T2;
Chris@10 68 T4 = T2 + T3;
Chris@10 69 {
Chris@10 70 E T1w, Tw, Tt, T1x;
Chris@10 71 T1w = Tu + Tv;
Chris@10 72 Tw = Tu - Tv;
Chris@10 73 Ti = ri[WS(is, 11)];
Chris@10 74 T5 = T1 + T4;
Chris@10 75 Tt = FNMS(KP500000000, T4, T1);
Chris@10 76 T2l = T1v + T1w;
Chris@10 77 T1x = FNMS(KP500000000, T1w, T1v);
Chris@10 78 Tx = FNMS(KP866025403, Tw, Tt);
Chris@10 79 TV = FMA(KP866025403, Tw, Tt);
Chris@10 80 T1z = FMA(KP866025403, T1y, T1x);
Chris@10 81 T1X = FNMS(KP866025403, T1y, T1x);
Chris@10 82 Tj = ri[WS(is, 1)];
Chris@10 83 }
Chris@10 84 T1h = ii[WS(is, 6)];
Chris@10 85 TK = ii[WS(is, 11)];
Chris@10 86 TL = ii[WS(is, 1)];
Chris@10 87 Tk = Ti + Tj;
Chris@10 88 T1k = Tj - Ti;
Chris@10 89 }
/* Load inputs 9, 14 and 4 while finishing the 6/11/1 sums. */
Chris@10 90 Tm = ri[WS(is, 9)];
Chris@10 91 TM = TK - TL;
Chris@10 92 T1i = TK + TL;
Chris@10 93 TJ = FNMS(KP500000000, Tk, Th);
Chris@10 94 Tl = Th + Tk;
Chris@10 95 Tn = ri[WS(is, 14)];
Chris@10 96 To = ri[WS(is, 4)];
Chris@10 97 T2c = T1h + T1i;
Chris@10 98 T1j = FNMS(KP500000000, T1i, T1h);
Chris@10 99 T1m = ii[WS(is, 9)];
Chris@10 100 TP = ii[WS(is, 14)];
Chris@10 101 T1p = To - Tn;
Chris@10 102 Tp = Tn + To;
Chris@10 103 TQ = ii[WS(is, 4)];
Chris@10 104 }
Chris@10 105 {
Chris@10 106 E TN, TS, T1o, T2d;
Chris@10 107 {
Chris@10 108 E TO, T1n, TR, Tq;
Chris@10 109 TN = FNMS(KP866025403, TM, TJ);
Chris@10 110 TZ = FMA(KP866025403, TM, TJ);
Chris@10 111 TO = FNMS(KP500000000, Tp, Tm);
Chris@10 112 Tq = Tm + Tp;
Chris@10 113 T1n = TP + TQ;
Chris@10 114 TR = TP - TQ;
Chris@10 115 T2s = Tl - Tq;
Chris@10 116 Tr = Tl + Tq;
Chris@10 117 T10 = FMA(KP866025403, TR, TO);
Chris@10 118 TS = FNMS(KP866025403, TR, TO);
Chris@10 119 T1o = FNMS(KP500000000, T1n, T1m);
Chris@10 120 T2d = T1m + T1n;
Chris@10 121 }
Chris@10 122 {
Chris@10 123 E T1O, T1l, T1P, T1q;
Chris@10 124 T1O = FNMS(KP866025403, T1k, T1j);
Chris@10 125 T1l = FMA(KP866025403, T1k, T1j);
Chris@10 126 T24 = TN - TS;
Chris@10 127 TT = TN + TS;
Chris@10 128 T1P = FNMS(KP866025403, T1p, T1o);
Chris@10 129 T1q = FMA(KP866025403, T1p, T1o);
Chris@10 130 T2e = T2c - T2d;
Chris@10 131 T2n = T2c + T2d;
Chris@10 132 T1Z = T1O + T1P;
Chris@10 133 T1Q = T1O - T1P;
Chris@10 134 T1r = T1l - T1q;
Chris@10 135 T1B = T1l + T1q;
Chris@10 136 }
Chris@10 137 }
Chris@10 138 }
Chris@10 139 {
Chris@10 140 E T19, Tb, TB, Ty, Ta, T2f, T18, T1b, TE, T1e, Te, TF;
Chris@10 141 {
Chris@10 142 E T6, T16, Tz, TA, T9, T7, T8, Tc, Td, T17;
/* Load inputs 3, 8 and 13, then 12, 2 and 7. */
Chris@10 143 T6 = ri[WS(is, 3)];
Chris@10 144 T7 = ri[WS(is, 8)];
Chris@10 145 T11 = TZ + T10;
Chris@10 146 T1H = TZ - T10;
Chris@10 147 T8 = ri[WS(is, 13)];
Chris@10 148 T16 = ii[WS(is, 3)];
Chris@10 149 Tz = ii[WS(is, 8)];
Chris@10 150 TA = ii[WS(is, 13)];
Chris@10 151 T9 = T7 + T8;
Chris@10 152 T19 = T8 - T7;
Chris@10 153 Tb = ri[WS(is, 12)];
Chris@10 154 TB = Tz - TA;
Chris@10 155 T17 = Tz + TA;
Chris@10 156 Ty = FNMS(KP500000000, T9, T6);
Chris@10 157 Ta = T6 + T9;
Chris@10 158 Tc = ri[WS(is, 2)];
Chris@10 159 Td = ri[WS(is, 7)];
Chris@10 160 T2f = T16 + T17;
Chris@10 161 T18 = FNMS(KP500000000, T17, T16);
Chris@10 162 T1b = ii[WS(is, 12)];
Chris@10 163 TE = ii[WS(is, 2)];
Chris@10 164 T1e = Td - Tc;
Chris@10 165 Te = Tc + Td;
Chris@10 166 TF = ii[WS(is, 7)];
Chris@10 167 }
Chris@10 168 {
Chris@10 169 E TC, TH, T1d, T2g;
Chris@10 170 {
Chris@10 171 E TD, T1c, TG, Tf;
Chris@10 172 TC = FNMS(KP866025403, TB, Ty);
Chris@10 173 TW = FMA(KP866025403, TB, Ty);
Chris@10 174 TD = FNMS(KP500000000, Te, Tb);
Chris@10 175 Tf = Tb + Te;
Chris@10 176 T1c = TE + TF;
Chris@10 177 TG = TE - TF;
Chris@10 178 T2t = Ta - Tf;
Chris@10 179 Tg = Ta + Tf;
Chris@10 180 TX = FMA(KP866025403, TG, TD);
Chris@10 181 TH = FNMS(KP866025403, TG, TD);
Chris@10 182 T1d = FNMS(KP500000000, T1c, T1b);
Chris@10 183 T2g = T1b + T1c;
Chris@10 184 }
Chris@10 185 {
Chris@10 186 E T1R, T1a, T1S, T1f;
Chris@10 187 T1R = FNMS(KP866025403, T19, T18);
Chris@10 188 T1a = FMA(KP866025403, T19, T18);
Chris@10 189 T25 = TC - TH;
Chris@10 190 TI = TC + TH;
Chris@10 191 T1S = FNMS(KP866025403, T1e, T1d);
Chris@10 192 T1f = FMA(KP866025403, T1e, T1d);
Chris@10 193 T2h = T2f - T2g;
Chris@10 194 T2m = T2f + T2g;
Chris@10 195 T1Y = T1R + T1S;
Chris@10 196 T1T = T1R - T1S;
Chris@10 197 T1g = T1a - T1f;
Chris@10 198 T1A = T1a + T1f;
Chris@10 199 }
Chris@10 200 }
Chris@10 201 }
Chris@10 202 }
/* Output stage: every store to io[] and the stores to ro[] at indices
 * 0, 2, 3, 5, 6, 8, 9, 10, 11, 12 and 14 happen in this block. */
Chris@10 203 {
Chris@10 204 E TY, T1G, T1M, T1L, T2a, T29, Ts, T22, T21, T20;
Chris@10 205 T2a = Tg - Tr;
Chris@10 206 Ts = Tg + Tr;
Chris@10 207 TY = TW + TX;
Chris@10 208 T1G = TW - TX;
Chris@10 209 T29 = FNMS(KP250000000, Ts, T5);
Chris@10 210 ro[0] = T5 + Ts;
Chris@10 211 {
Chris@10 212 E T2q, T2p, T2o, TU;
Chris@10 213 T2o = T2m + T2n;
Chris@10 214 T2q = T2m - T2n;
Chris@10 215 {
Chris@10 216 E T2k, T2i, T2b, T2j;
Chris@10 217 T2k = FMA(KP618033988, T2e, T2h);
Chris@10 218 T2i = FNMS(KP618033988, T2h, T2e);
Chris@10 219 T2b = FNMS(KP559016994, T2a, T29);
Chris@10 220 T2j = FMA(KP559016994, T2a, T29);
Chris@10 221 ro[WS(os, 3)] = FMA(KP951056516, T2i, T2b);
Chris@10 222 ro[WS(os, 12)] = FNMS(KP951056516, T2i, T2b);
Chris@10 223 ro[WS(os, 6)] = FMA(KP951056516, T2k, T2j);
Chris@10 224 ro[WS(os, 9)] = FNMS(KP951056516, T2k, T2j);
Chris@10 225 T2p = FNMS(KP250000000, T2o, T2l);
Chris@10 226 }
Chris@10 227 io[0] = T2l + T2o;
Chris@10 228 TU = TI + TT;
Chris@10 229 T1M = TI - TT;
Chris@10 230 {
Chris@10 231 E T2r, T2v, T2w, T2u;
Chris@10 232 T2r = FNMS(KP559016994, T2q, T2p);
Chris@10 233 T2v = FMA(KP559016994, T2q, T2p);
Chris@10 234 T2w = FMA(KP618033988, T2s, T2t);
Chris@10 235 T2u = FNMS(KP618033988, T2t, T2s);
Chris@10 236 io[WS(os, 9)] = FMA(KP951056516, T2w, T2v);
Chris@10 237 io[WS(os, 6)] = FNMS(KP951056516, T2w, T2v);
Chris@10 238 io[WS(os, 12)] = FMA(KP951056516, T2u, T2r);
Chris@10 239 io[WS(os, 3)] = FNMS(KP951056516, T2u, T2r);
Chris@10 240 T1L = FNMS(KP250000000, TU, Tx);
Chris@10 241 }
Chris@10 242 ro[WS(os, 5)] = Tx + TU;
Chris@10 243 }
Chris@10 244 T20 = T1Y + T1Z;
Chris@10 245 T22 = T1Y - T1Z;
Chris@10 246 {
Chris@10 247 E T1N, T1V, T1W, T1U;
Chris@10 248 T1N = FNMS(KP559016994, T1M, T1L);
Chris@10 249 T1V = FMA(KP559016994, T1M, T1L);
Chris@10 250 T1W = FMA(KP618033988, T1Q, T1T);
Chris@10 251 T1U = FNMS(KP618033988, T1T, T1Q);
Chris@10 252 ro[WS(os, 11)] = FMA(KP951056516, T1W, T1V);
Chris@10 253 ro[WS(os, 14)] = FNMS(KP951056516, T1W, T1V);
Chris@10 254 ro[WS(os, 8)] = FMA(KP951056516, T1U, T1N);
Chris@10 255 ro[WS(os, 2)] = FNMS(KP951056516, T1U, T1N);
Chris@10 256 T21 = FNMS(KP250000000, T20, T1X);
Chris@10 257 }
Chris@10 258 io[WS(os, 5)] = T1X + T20;
Chris@10 259 {
Chris@10 260 E T1E, T1D, T1C, T12;
Chris@10 261 T1C = T1A + T1B;
Chris@10 262 T1E = T1A - T1B;
Chris@10 263 {
Chris@10 264 E T23, T27, T28, T26;
Chris@10 265 T23 = FNMS(KP559016994, T22, T21);
Chris@10 266 T27 = FMA(KP559016994, T22, T21);
Chris@10 267 T28 = FMA(KP618033988, T24, T25);
Chris@10 268 T26 = FNMS(KP618033988, T25, T24);
Chris@10 269 io[WS(os, 14)] = FMA(KP951056516, T28, T27);
Chris@10 270 io[WS(os, 11)] = FNMS(KP951056516, T28, T27);
Chris@10 271 io[WS(os, 8)] = FNMS(KP951056516, T26, T23);
Chris@10 272 io[WS(os, 2)] = FMA(KP951056516, T26, T23);
Chris@10 273 T1D = FNMS(KP250000000, T1C, T1z);
Chris@10 274 }
Chris@10 275 io[WS(os, 10)] = T1z + T1C;
Chris@10 276 T12 = TY + T11;
Chris@10 277 T14 = TY - T11;
Chris@10 278 {
Chris@10 279 E T1F, T1J, T1K, T1I;
Chris@10 280 T1F = FMA(KP559016994, T1E, T1D);
Chris@10 281 T1J = FNMS(KP559016994, T1E, T1D);
Chris@10 282 T1K = FNMS(KP618033988, T1G, T1H);
Chris@10 283 T1I = FMA(KP618033988, T1H, T1G);
Chris@10 284 io[WS(os, 13)] = FNMS(KP951056516, T1K, T1J);
Chris@10 285 io[WS(os, 7)] = FMA(KP951056516, T1K, T1J);
Chris@10 286 io[WS(os, 4)] = FMA(KP951056516, T1I, T1F);
Chris@10 287 io[WS(os, 1)] = FNMS(KP951056516, T1I, T1F);
Chris@10 288 T13 = FNMS(KP250000000, T12, TV);
Chris@10 289 }
Chris@10 290 ro[WS(os, 10)] = TV + T12;
Chris@10 291 }
Chris@10 292 }
Chris@10 293 }
/* Remaining stores to ro[] (indices 13, 7, 1 and 4), built from the four
 * loop-scope temporaries T1r, T1g, T14 and T13. */
Chris@10 294 {
Chris@10 295 E T1t, T15, T1s, T1u;
Chris@10 296 T1t = FNMS(KP559016994, T14, T13);
Chris@10 297 T15 = FMA(KP559016994, T14, T13);
Chris@10 298 T1s = FMA(KP618033988, T1r, T1g);
Chris@10 299 T1u = FNMS(KP618033988, T1g, T1r);
Chris@10 300 ro[WS(os, 13)] = FMA(KP951056516, T1u, T1t);
Chris@10 301 ro[WS(os, 7)] = FNMS(KP951056516, T1u, T1t);
Chris@10 302 ro[WS(os, 1)] = FMA(KP951056516, T1s, T15);
Chris@10 303 ro[WS(os, 4)] = FNMS(KP951056516, T1s, T15);
Chris@10 304 }
Chris@10 305 }
Chris@10 306 }
Chris@10 307 }
Chris@10 308
/*
 * Descriptor handed to the planner together with the HAVE_FMA n1_15 kernel:
 * size 15, name "n1_15", then the operation counts {72, 0, 84, 0}. The first
 * three counts match the generator's summary comment for this variant
 * (72 additions, 0 multiplications, 84 fused multiply/add).
 * NOTE(review): the meaning of the fourth count, of GENUS (declared outside
 * this file) and of the four trailing zeros is not visible here -- confirm
 * against the kdft_desc definition.
 */
Chris@10 309 static const kdft_desc desc = { 15, "n1_15", {72, 0, 84, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 310
/*
 * Registration entry point for the HAVE_FMA build: passes the n1_15 kernel
 * and its descriptor to X(kdft_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file, presumably a
 * symbol-prefixing wrapper -- confirm in the project headers.
 */
Chris@10 311 void X(codelet_n1_15) (planner *p) {
Chris@10 312 X(kdft_register) (p, n1_15, &desc);
Chris@10 313 }
Chris@10 314
Chris@10 315 #else /* HAVE_FMA */
Chris@10 316
Chris@10 317 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include n.h */
Chris@10 318
Chris@10 319 /*
Chris@10 320 * This function contains 156 FP additions, 56 FP multiplications,
Chris@10 321 * (or, 128 additions, 28 multiplications, 28 fused multiply/add),
Chris@10 322 * 69 stack variables, 6 constants, and 60 memory accesses
Chris@10 323 */
Chris@10 324 #include "n.h"
Chris@10 325
/*
 * n1_15 (variant compiled when HAVE_FMA is not defined): generated
 * straight-line kernel for a transform of size 15 (the size recorded in the
 * kdft_desc that follows this function).
 *
 * ri, ii   - input arrays; element k is read as ri[WS(is, k)] and
 *            ii[WS(is, k)] for k = 0..14 (presumably the real and imaginary
 *            parts of the input -- confirm against codelet-dft.h).
 * ro, io   - output arrays; element k is written as ro[WS(os, k)] and
 *            io[WS(os, k)] for k = 0..14.
 * is, os   - input and output strides, applied through the WS() macro.
 * v        - number of transforms to perform; the loop body runs v times and
 *            not at all when v <= 0.
 * ivs, ovs - amounts added to the input / output pointers after each
 *            transform.
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are declared
 * outside this file.
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are presumably a*b + c and
 * c - a*b -- confirm in the project headers.
 */
Chris@10 326 static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 327 {
/* Numeric constants: 0.587... is sin(pi/5), 0.951... is sin(2*pi/5),
 * 0.559... is sqrt(5)/4 and 0.866... is sqrt(3)/2. */
Chris@10 328 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@10 329 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 330 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 331 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 332 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 333 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 334 {
Chris@10 335 INT i;
/* One transform per iteration; the four data pointers advance by ivs / ovs
 * in the loop's increment expression. */
Chris@10 336 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
/* Intermediates shared between the four input blocks and the six output
 * blocks below. */
Chris@10 337 E T5, T2l, Tx, TV, T1C, T20, Tl, Tq, Tr, TN, TS, TT, T2c, T2d, T2n;
Chris@10 338 E T1O, T1P, T22, T1l, T1q, T1w, TZ, T10, T11, Ta, Tf, Tg, TC, TH, TI;
Chris@10 339 E T2f, T2g, T2m, T1R, T1S, T21, T1a, T1f, T1v, TW, TX, TY;
/* Input block 1: combine inputs 0, 5 and 10. */
Chris@10 340 {
Chris@10 341 E T1, T1z, T4, T1y, Tw, T1A, Tt, T1B;
Chris@10 342 T1 = ri[0];
Chris@10 343 T1z = ii[0];
Chris@10 344 {
Chris@10 345 E T2, T3, Tu, Tv;
Chris@10 346 T2 = ri[WS(is, 5)];
Chris@10 347 T3 = ri[WS(is, 10)];
Chris@10 348 T4 = T2 + T3;
Chris@10 349 T1y = KP866025403 * (T3 - T2);
Chris@10 350 Tu = ii[WS(is, 5)];
Chris@10 351 Tv = ii[WS(is, 10)];
Chris@10 352 Tw = KP866025403 * (Tu - Tv);
Chris@10 353 T1A = Tu + Tv;
Chris@10 354 }
Chris@10 355 T5 = T1 + T4;
Chris@10 356 T2l = T1z + T1A;
Chris@10 357 Tt = FNMS(KP500000000, T4, T1);
Chris@10 358 Tx = Tt - Tw;
Chris@10 359 TV = Tt + Tw;
Chris@10 360 T1B = FNMS(KP500000000, T1A, T1z);
Chris@10 361 T1C = T1y + T1B;
Chris@10 362 T20 = T1B - T1y;
Chris@10 363 }
/* Input block 2: combine inputs 6, 11, 1 and inputs 9, 14, 4, then form
 * sums of the two groups. */
Chris@10 364 {
Chris@10 365 E Th, Tk, TJ, T1h, T1i, T1j, TM, T1k, Tm, Tp, TO, T1m, T1n, T1o, TR;
Chris@10 366 E T1p;
Chris@10 367 {
Chris@10 368 E Ti, Tj, TK, TL;
Chris@10 369 Th = ri[WS(is, 6)];
Chris@10 370 Ti = ri[WS(is, 11)];
Chris@10 371 Tj = ri[WS(is, 1)];
Chris@10 372 Tk = Ti + Tj;
Chris@10 373 TJ = FNMS(KP500000000, Tk, Th);
Chris@10 374 T1h = KP866025403 * (Tj - Ti);
Chris@10 375 T1i = ii[WS(is, 6)];
Chris@10 376 TK = ii[WS(is, 11)];
Chris@10 377 TL = ii[WS(is, 1)];
Chris@10 378 T1j = TK + TL;
Chris@10 379 TM = KP866025403 * (TK - TL);
Chris@10 380 T1k = FNMS(KP500000000, T1j, T1i);
Chris@10 381 }
Chris@10 382 {
Chris@10 383 E Tn, To, TP, TQ;
Chris@10 384 Tm = ri[WS(is, 9)];
Chris@10 385 Tn = ri[WS(is, 14)];
Chris@10 386 To = ri[WS(is, 4)];
Chris@10 387 Tp = Tn + To;
Chris@10 388 TO = FNMS(KP500000000, Tp, Tm);
Chris@10 389 T1m = KP866025403 * (To - Tn);
Chris@10 390 T1n = ii[WS(is, 9)];
Chris@10 391 TP = ii[WS(is, 14)];
Chris@10 392 TQ = ii[WS(is, 4)];
Chris@10 393 T1o = TP + TQ;
Chris@10 394 TR = KP866025403 * (TP - TQ);
Chris@10 395 T1p = FNMS(KP500000000, T1o, T1n);
Chris@10 396 }
Chris@10 397 Tl = Th + Tk;
Chris@10 398 Tq = Tm + Tp;
Chris@10 399 Tr = Tl + Tq;
Chris@10 400 TN = TJ - TM;
Chris@10 401 TS = TO - TR;
Chris@10 402 TT = TN + TS;
Chris@10 403 T2c = T1i + T1j;
Chris@10 404 T2d = T1n + T1o;
Chris@10 405 T2n = T2c + T2d;
Chris@10 406 T1O = T1k - T1h;
Chris@10 407 T1P = T1p - T1m;
Chris@10 408 T22 = T1O + T1P;
Chris@10 409 T1l = T1h + T1k;
Chris@10 410 T1q = T1m + T1p;
Chris@10 411 T1w = T1l + T1q;
Chris@10 412 TZ = TJ + TM;
Chris@10 413 T10 = TO + TR;
Chris@10 414 T11 = TZ + T10;
Chris@10 415 }
/* Input block 3: combine inputs 3, 8, 13 and inputs 12, 2, 7, then form
 * sums of the two groups. */
Chris@10 416 {
Chris@10 417 E T6, T9, Ty, T16, T17, T18, TB, T19, Tb, Te, TD, T1b, T1c, T1d, TG;
Chris@10 418 E T1e;
Chris@10 419 {
Chris@10 420 E T7, T8, Tz, TA;
Chris@10 421 T6 = ri[WS(is, 3)];
Chris@10 422 T7 = ri[WS(is, 8)];
Chris@10 423 T8 = ri[WS(is, 13)];
Chris@10 424 T9 = T7 + T8;
Chris@10 425 Ty = FNMS(KP500000000, T9, T6);
Chris@10 426 T16 = KP866025403 * (T8 - T7);
Chris@10 427 T17 = ii[WS(is, 3)];
Chris@10 428 Tz = ii[WS(is, 8)];
Chris@10 429 TA = ii[WS(is, 13)];
Chris@10 430 T18 = Tz + TA;
Chris@10 431 TB = KP866025403 * (Tz - TA);
Chris@10 432 T19 = FNMS(KP500000000, T18, T17);
Chris@10 433 }
Chris@10 434 {
Chris@10 435 E Tc, Td, TE, TF;
Chris@10 436 Tb = ri[WS(is, 12)];
Chris@10 437 Tc = ri[WS(is, 2)];
Chris@10 438 Td = ri[WS(is, 7)];
Chris@10 439 Te = Tc + Td;
Chris@10 440 TD = FNMS(KP500000000, Te, Tb);
Chris@10 441 T1b = KP866025403 * (Td - Tc);
Chris@10 442 T1c = ii[WS(is, 12)];
Chris@10 443 TE = ii[WS(is, 2)];
Chris@10 444 TF = ii[WS(is, 7)];
Chris@10 445 T1d = TE + TF;
Chris@10 446 TG = KP866025403 * (TE - TF);
Chris@10 447 T1e = FNMS(KP500000000, T1d, T1c);
Chris@10 448 }
Chris@10 449 Ta = T6 + T9;
Chris@10 450 Tf = Tb + Te;
Chris@10 451 Tg = Ta + Tf;
Chris@10 452 TC = Ty - TB;
Chris@10 453 TH = TD - TG;
Chris@10 454 TI = TC + TH;
Chris@10 455 T2f = T17 + T18;
Chris@10 456 T2g = T1c + T1d;
Chris@10 457 T2m = T2f + T2g;
Chris@10 458 T1R = T19 - T16;
Chris@10 459 T1S = T1e - T1b;
Chris@10 460 T21 = T1R + T1S;
Chris@10 461 T1a = T16 + T19;
Chris@10 462 T1f = T1b + T1e;
Chris@10 463 T1v = T1a + T1f;
Chris@10 464 TW = Ty + TB;
Chris@10 465 TX = TD + TG;
Chris@10 466 TY = TW + TX;
Chris@10 467 }
/* Writes ro[] at indices 0, 9, 6, 12 and 3. */
Chris@10 468 {
Chris@10 469 E T2a, Ts, T29, T2i, T2k, T2e, T2h, T2j, T2b;
Chris@10 470 T2a = KP559016994 * (Tg - Tr);
Chris@10 471 Ts = Tg + Tr;
Chris@10 472 T29 = FNMS(KP250000000, Ts, T5);
Chris@10 473 T2e = T2c - T2d;
Chris@10 474 T2h = T2f - T2g;
Chris@10 475 T2i = FNMS(KP587785252, T2h, KP951056516 * T2e);
Chris@10 476 T2k = FMA(KP951056516, T2h, KP587785252 * T2e);
Chris@10 477 ro[0] = T5 + Ts;
Chris@10 478 T2j = T2a + T29;
Chris@10 479 ro[WS(os, 9)] = T2j - T2k;
Chris@10 480 ro[WS(os, 6)] = T2j + T2k;
Chris@10 481 T2b = T29 - T2a;
Chris@10 482 ro[WS(os, 12)] = T2b - T2i;
Chris@10 483 ro[WS(os, 3)] = T2b + T2i;
Chris@10 484 }
/* Writes io[] at indices 0, 6, 9, 3 and 12. */
Chris@10 485 {
Chris@10 486 E T2q, T2o, T2p, T2u, T2w, T2s, T2t, T2v, T2r;
Chris@10 487 T2q = KP559016994 * (T2m - T2n);
Chris@10 488 T2o = T2m + T2n;
Chris@10 489 T2p = FNMS(KP250000000, T2o, T2l);
Chris@10 490 T2s = Tl - Tq;
Chris@10 491 T2t = Ta - Tf;
Chris@10 492 T2u = FNMS(KP587785252, T2t, KP951056516 * T2s);
Chris@10 493 T2w = FMA(KP951056516, T2t, KP587785252 * T2s);
Chris@10 494 io[0] = T2l + T2o;
Chris@10 495 T2v = T2q + T2p;
Chris@10 496 io[WS(os, 6)] = T2v - T2w;
Chris@10 497 io[WS(os, 9)] = T2w + T2v;
Chris@10 498 T2r = T2p - T2q;
Chris@10 499 io[WS(os, 3)] = T2r - T2u;
Chris@10 500 io[WS(os, 12)] = T2u + T2r;
Chris@10 501 }
/* Writes ro[] at indices 5, 14, 11, 2 and 8. */
Chris@10 502 {
Chris@10 503 E T1M, TU, T1L, T1U, T1W, T1Q, T1T, T1V, T1N;
Chris@10 504 T1M = KP559016994 * (TI - TT);
Chris@10 505 TU = TI + TT;
Chris@10 506 T1L = FNMS(KP250000000, TU, Tx);
Chris@10 507 T1Q = T1O - T1P;
Chris@10 508 T1T = T1R - T1S;
Chris@10 509 T1U = FNMS(KP587785252, T1T, KP951056516 * T1Q);
Chris@10 510 T1W = FMA(KP951056516, T1T, KP587785252 * T1Q);
Chris@10 511 ro[WS(os, 5)] = Tx + TU;
Chris@10 512 T1V = T1M + T1L;
Chris@10 513 ro[WS(os, 14)] = T1V - T1W;
Chris@10 514 ro[WS(os, 11)] = T1V + T1W;
Chris@10 515 T1N = T1L - T1M;
Chris@10 516 ro[WS(os, 2)] = T1N - T1U;
Chris@10 517 ro[WS(os, 8)] = T1N + T1U;
Chris@10 518 }
/* Writes io[] at indices 5, 11, 14, 2 and 8. */
Chris@10 519 {
Chris@10 520 E T25, T23, T24, T1Z, T28, T1X, T1Y, T27, T26;
Chris@10 521 T25 = KP559016994 * (T21 - T22);
Chris@10 522 T23 = T21 + T22;
Chris@10 523 T24 = FNMS(KP250000000, T23, T20);
Chris@10 524 T1X = TN - TS;
Chris@10 525 T1Y = TC - TH;
Chris@10 526 T1Z = FNMS(KP587785252, T1Y, KP951056516 * T1X);
Chris@10 527 T28 = FMA(KP951056516, T1Y, KP587785252 * T1X);
Chris@10 528 io[WS(os, 5)] = T20 + T23;
Chris@10 529 T27 = T25 + T24;
Chris@10 530 io[WS(os, 11)] = T27 - T28;
Chris@10 531 io[WS(os, 14)] = T28 + T27;
Chris@10 532 T26 = T24 - T25;
Chris@10 533 io[WS(os, 2)] = T1Z + T26;
Chris@10 534 io[WS(os, 8)] = T26 - T1Z;
Chris@10 535 }
/* Writes io[] at indices 10, 7, 13, 1 and 4. */
Chris@10 536 {
Chris@10 537 E T1x, T1D, T1E, T1I, T1J, T1G, T1H, T1K, T1F;
Chris@10 538 T1x = KP559016994 * (T1v - T1w);
Chris@10 539 T1D = T1v + T1w;
Chris@10 540 T1E = FNMS(KP250000000, T1D, T1C);
Chris@10 541 T1G = TW - TX;
Chris@10 542 T1H = TZ - T10;
Chris@10 543 T1I = FMA(KP951056516, T1G, KP587785252 * T1H);
Chris@10 544 T1J = FNMS(KP587785252, T1G, KP951056516 * T1H);
Chris@10 545 io[WS(os, 10)] = T1C + T1D;
Chris@10 546 T1K = T1E - T1x;
Chris@10 547 io[WS(os, 7)] = T1J + T1K;
Chris@10 548 io[WS(os, 13)] = T1K - T1J;
Chris@10 549 T1F = T1x + T1E;
Chris@10 550 io[WS(os, 1)] = T1F - T1I;
Chris@10 551 io[WS(os, 4)] = T1I + T1F;
Chris@10 552 }
/* Writes ro[] at indices 10, 7, 13, 4 and 1. */
Chris@10 553 {
Chris@10 554 E T13, T12, T14, T1s, T1u, T1g, T1r, T1t, T15;
Chris@10 555 T13 = KP559016994 * (TY - T11);
Chris@10 556 T12 = TY + T11;
Chris@10 557 T14 = FNMS(KP250000000, T12, TV);
Chris@10 558 T1g = T1a - T1f;
Chris@10 559 T1r = T1l - T1q;
Chris@10 560 T1s = FMA(KP951056516, T1g, KP587785252 * T1r);
Chris@10 561 T1u = FNMS(KP587785252, T1g, KP951056516 * T1r);
Chris@10 562 ro[WS(os, 10)] = TV + T12;
Chris@10 563 T1t = T14 - T13;
Chris@10 564 ro[WS(os, 7)] = T1t - T1u;
Chris@10 565 ro[WS(os, 13)] = T1t + T1u;
Chris@10 566 T15 = T13 + T14;
Chris@10 567 ro[WS(os, 4)] = T15 - T1s;
Chris@10 568 ro[WS(os, 1)] = T15 + T1s;
Chris@10 569 }
Chris@10 570 }
Chris@10 571 }
Chris@10 572 }
Chris@10 573
/*
 * Descriptor handed to the planner together with the non-FMA n1_15 kernel:
 * size 15, name "n1_15", then the operation counts {128, 28, 28, 0}. The
 * first three counts match the generator's summary comment for this variant
 * (128 additions, 28 multiplications, 28 fused multiply/add).
 * NOTE(review): the meaning of the fourth count, of GENUS (declared outside
 * this file) and of the four trailing zeros is not visible here -- confirm
 * against the kdft_desc definition.
 */
Chris@10 574 static const kdft_desc desc = { 15, "n1_15", {128, 28, 28, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 575
/*
 * Registration entry point for the non-FMA build: passes the n1_15 kernel
 * and its descriptor to X(kdft_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file, presumably a
 * symbol-prefixing wrapper -- confirm in the project headers.
 */
Chris@10 576 void X(codelet_n1_15) (planner *p) {
Chris@10 577 X(kdft_register) (p, n1_15, &desc);
Chris@10 578 }
Chris@10 579
Chris@10 580 #endif /* HAVE_FMA */