annotate src/fftw-3.3.3/dft/scalar/codelets/n1_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:35:44 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include n.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 144 FP additions, 40 FP multiplications,
Chris@10 32 * (or, 104 additions, 0 multiplications, 40 fused multiply/add),
Chris@10 33 * 82 stack variables, 3 constants, and 64 memory accesses
Chris@10 34 */
Chris@10 35 #include "n.h"
Chris@10 36
/*
 * n1_16 (HAVE_FMA build): machine-generated kernel for n = 16, produced by
 * the gen_notw command line quoted above. The file header says it is
 * generated and must not be edited by hand, so only comments are added.
 *
 * Per loop iteration the body reads ri[WS(is, k)] and ii[WS(is, k)] for
 * k = 0..15 and stores ro[WS(os, k)] and io[WS(os, k)] for k = 0..15.
 * ro[0] and io[0] are the plain sums of the sixteen ri and ii values,
 * which is consistent with a 16-point DFT over (ri, ii) as real and
 * imaginary parts -- presumably the forward transform; confirm against
 * the genfft documentation.
 *
 * ri, ii    input arrays (const, only read)
 * ro, io    output arrays (only written)
 * is, os    strides handed to WS() to form input and output offsets
 * v         number of loop iterations
 * ivs, ovs  amounts added to the input and output pointers after each
 *           iteration
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * the included headers. NOTE(review): FMA(a, b, c) is presumably a * b + c
 * and FNMS(a, b, c) presumably c - a * b; confirm in the headers.
 */
Chris@10 37 static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 38 {
/* Numerically: cos(pi/8), tan(pi/8) = sqrt(2) - 1, and sqrt(2)/2. */
Chris@10 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@10 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 42 {
Chris@10 43 INT i;
/* Count i down from v; each pass advances ri/ii by ivs and ro/io by ovs. */
Chris@10 44 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
/* Declared at loop scope: most are assigned inside the nested blocks and
 * all are consumed by the stores at the end of the loop body. */
Chris@10 45 E T1z, T1L, T1M, T1N, T1P, T1J, T1K, T1G, T1O, T1Q;
Chris@10 46 {
Chris@10 47 E T1l, T1H, T1R, T7, T1x, TN, TC, T25, T1E, T1b, T1Z, Tt, T2h, T22, T1D;
Chris@10 48 E T1g, T1n, TQ, Te, T26, TT, T1m, TJ, T1S, Tj, T11, Ti, T1V, TZ, Tk;
Chris@10 49 E T12, T13;
Chris@10 50 {
Chris@10 51 E Tq, T1c, Tp, T20, T1a, Tr, T1d, T1e;
Chris@10 52 {
Chris@10 53 E T4, TL, T3, T1k, Ty, T5, Tz, TA;
Chris@10 54 {
Chris@10 55 E T1, T2, Tw, Tx;
/* Inputs 0, 8, 4 and 12: sums and differences of the pairs. */
Chris@10 56 T1 = ri[0];
Chris@10 57 T2 = ri[WS(is, 8)];
Chris@10 58 Tw = ii[0];
Chris@10 59 Tx = ii[WS(is, 8)];
Chris@10 60 T4 = ri[WS(is, 4)];
Chris@10 61 TL = T1 - T2;
Chris@10 62 T3 = T1 + T2;
Chris@10 63 T1k = Tw - Tx;
Chris@10 64 Ty = Tw + Tx;
Chris@10 65 T5 = ri[WS(is, 12)];
Chris@10 66 Tz = ii[WS(is, 4)];
Chris@10 67 TA = ii[WS(is, 12)];
Chris@10 68 }
Chris@10 69 {
Chris@10 70 E Tn, To, T18, T19;
/* Inputs 15, 7, 3 and 11; these loads are interleaved with the
 * arithmetic that finishes the 0/8/4/12 group. */
Chris@10 71 Tn = ri[WS(is, 15)];
Chris@10 72 {
Chris@10 73 E T1j, T6, TM, TB;
Chris@10 74 T1j = T4 - T5;
Chris@10 75 T6 = T4 + T5;
Chris@10 76 TM = Tz - TA;
Chris@10 77 TB = Tz + TA;
Chris@10 78 T1l = T1j + T1k;
Chris@10 79 T1H = T1k - T1j;
Chris@10 80 T1R = T3 - T6;
Chris@10 81 T7 = T3 + T6;
Chris@10 82 T1x = TL + TM;
Chris@10 83 TN = TL - TM;
Chris@10 84 TC = Ty + TB;
Chris@10 85 T25 = Ty - TB;
Chris@10 86 To = ri[WS(is, 7)];
Chris@10 87 }
Chris@10 88 T18 = ii[WS(is, 15)];
Chris@10 89 T19 = ii[WS(is, 7)];
Chris@10 90 Tq = ri[WS(is, 3)];
Chris@10 91 T1c = Tn - To;
Chris@10 92 Tp = Tn + To;
Chris@10 93 T20 = T18 + T19;
Chris@10 94 T1a = T18 - T19;
Chris@10 95 Tr = ri[WS(is, 11)];
Chris@10 96 T1d = ii[WS(is, 3)];
Chris@10 97 T1e = ii[WS(is, 11)];
Chris@10 98 }
Chris@10 99 }
Chris@10 100 {
Chris@10 101 E Tb, TP, Ta, TO, TF, Tc, TG, TH;
Chris@10 102 {
Chris@10 103 E T8, T9, TD, TE;
/* Inputs 2, 10, 14 and 6; the nested block first finishes the
 * 15/7/3/11 group. */
Chris@10 104 T8 = ri[WS(is, 2)];
Chris@10 105 {
Chris@10 106 E T17, Ts, T21, T1f;
Chris@10 107 T17 = Tq - Tr;
Chris@10 108 Ts = Tq + Tr;
Chris@10 109 T21 = T1d + T1e;
Chris@10 110 T1f = T1d - T1e;
Chris@10 111 T1E = T1a - T17;
Chris@10 112 T1b = T17 + T1a;
Chris@10 113 T1Z = Tp - Ts;
Chris@10 114 Tt = Tp + Ts;
Chris@10 115 T2h = T20 + T21;
Chris@10 116 T22 = T20 - T21;
Chris@10 117 T1D = T1c + T1f;
Chris@10 118 T1g = T1c - T1f;
Chris@10 119 T9 = ri[WS(is, 10)];
Chris@10 120 }
Chris@10 121 TD = ii[WS(is, 2)];
Chris@10 122 TE = ii[WS(is, 10)];
Chris@10 123 Tb = ri[WS(is, 14)];
Chris@10 124 TP = T8 - T9;
Chris@10 125 Ta = T8 + T9;
Chris@10 126 TO = TD - TE;
Chris@10 127 TF = TD + TE;
Chris@10 128 Tc = ri[WS(is, 6)];
Chris@10 129 TG = ii[WS(is, 14)];
Chris@10 130 TH = ii[WS(is, 6)];
Chris@10 131 }
Chris@10 132 {
Chris@10 133 E TR, Td, TS, TI;
Chris@10 134 T1n = TP + TO;
Chris@10 135 TQ = TO - TP;
Chris@10 136 TR = Tb - Tc;
Chris@10 137 Td = Tb + Tc;
Chris@10 138 TS = TG - TH;
Chris@10 139 TI = TG + TH;
Chris@10 140 Te = Ta + Td;
Chris@10 141 T26 = Td - Ta;
Chris@10 142 TT = TR + TS;
Chris@10 143 T1m = TR - TS;
Chris@10 144 TJ = TF + TI;
Chris@10 145 T1S = TF - TI;
Chris@10 146 }
Chris@10 147 }
Chris@10 148 {
Chris@10 149 E Tg, Th, TX, TY;
/* Inputs 1, 9, 5 and 13 (the last of the sixteen input pairs). */
Chris@10 150 Tg = ri[WS(is, 1)];
Chris@10 151 Th = ri[WS(is, 9)];
Chris@10 152 TX = ii[WS(is, 1)];
Chris@10 153 TY = ii[WS(is, 9)];
Chris@10 154 Tj = ri[WS(is, 5)];
Chris@10 155 T11 = Tg - Th;
Chris@10 156 Ti = Tg + Th;
Chris@10 157 T1V = TX + TY;
Chris@10 158 TZ = TX - TY;
Chris@10 159 Tk = ri[WS(is, 13)];
Chris@10 160 T12 = ii[WS(is, 5)];
Chris@10 161 T13 = ii[WS(is, 13)];
Chris@10 162 }
Chris@10 163 }
Chris@10 164 {
Chris@10 165 E T2f, T1B, T10, T1U, T1X, T1A, T15, Tv, TK, T2i;
Chris@10 166 {
Chris@10 167 E Tf, Tu, T2j, T2k, T2g;
Chris@10 168 T2f = T7 - Te;
Chris@10 169 Tf = T7 + Te;
Chris@10 170 {
Chris@10 171 E TW, Tl, T1W, T14, Tm;
Chris@10 172 TW = Tj - Tk;
Chris@10 173 Tl = Tj + Tk;
Chris@10 174 T1W = T12 + T13;
Chris@10 175 T14 = T12 - T13;
Chris@10 176 T1B = TZ - TW;
Chris@10 177 T10 = TW + TZ;
Chris@10 178 T1U = Ti - Tl;
Chris@10 179 Tm = Ti + Tl;
Chris@10 180 T2g = T1V + T1W;
Chris@10 181 T1X = T1V - T1W;
Chris@10 182 T1A = T11 + T14;
Chris@10 183 T15 = T11 - T14;
Chris@10 184 Tu = Tm + Tt;
Chris@10 185 Tv = Tt - Tm;
Chris@10 186 }
Chris@10 187 TK = TC - TJ;
Chris@10 188 T2j = TC + TJ;
Chris@10 189 T2k = T2g + T2h;
Chris@10 190 T2i = T2g - T2h;
/* Outputs 0 and 8: Tf/T2j hold the even-indexed input sums and Tu/T2k
 * the odd-indexed ones, so index 0 is the total and index 8 is
 * even minus odd. */
Chris@10 191 ro[0] = Tf + Tu;
Chris@10 192 ro[WS(os, 8)] = Tf - Tu;
Chris@10 193 io[0] = T2j + T2k;
Chris@10 194 io[WS(os, 8)] = T2j - T2k;
Chris@10 195 }
Chris@10 196 {
Chris@10 197 E T29, T1T, T27, T2d, T2a, T2b, T28, T24, T1Y, T23;
Chris@10 198 T29 = T1R - T1S;
Chris@10 199 T1T = T1R + T1S;
/* Outputs 4 and 12 need only additions and subtractions. */
Chris@10 200 io[WS(os, 12)] = TK - Tv;
Chris@10 201 io[WS(os, 4)] = Tv + TK;
Chris@10 202 ro[WS(os, 4)] = T2f + T2i;
Chris@10 203 ro[WS(os, 12)] = T2f - T2i;
Chris@10 204 T27 = T25 - T26;
Chris@10 205 T2d = T26 + T25;
Chris@10 206 T2a = T1X - T1U;
Chris@10 207 T1Y = T1U + T1X;
Chris@10 208 T23 = T1Z - T22;
Chris@10 209 T2b = T1Z + T22;
Chris@10 210 T28 = T23 - T1Y;
Chris@10 211 T24 = T1Y + T23;
Chris@10 212 {
Chris@10 213 E T1I, TV, T1v, T1y, T1t, T1s, T1r, T1p, T1q, T1i;
Chris@10 214 {
Chris@10 215 E T1o, T2e, T2c, TU, T16, T1h;
Chris@10 216 T1I = TQ + TT;
Chris@10 217 TU = TQ - TT;
/* Outputs 2, 6, 10 and 14 use only the KP707106781 constant. */
Chris@10 218 io[WS(os, 14)] = FNMS(KP707106781, T28, T27);
Chris@10 219 io[WS(os, 6)] = FMA(KP707106781, T28, T27);
Chris@10 220 ro[WS(os, 2)] = FMA(KP707106781, T24, T1T);
Chris@10 221 ro[WS(os, 10)] = FNMS(KP707106781, T24, T1T);
Chris@10 222 T2e = T2a + T2b;
Chris@10 223 T2c = T2a - T2b;
Chris@10 224 TV = FMA(KP707106781, TU, TN);
Chris@10 225 T1v = FNMS(KP707106781, TU, TN);
Chris@10 226 io[WS(os, 10)] = FNMS(KP707106781, T2e, T2d);
Chris@10 227 io[WS(os, 2)] = FMA(KP707106781, T2e, T2d);
Chris@10 228 ro[WS(os, 6)] = FMA(KP707106781, T2c, T29);
Chris@10 229 ro[WS(os, 14)] = FNMS(KP707106781, T2c, T29);
Chris@10 230 T1o = T1m - T1n;
Chris@10 231 T1y = T1n + T1m;
Chris@10 232 T1t = FNMS(KP414213562, T10, T15);
Chris@10 233 T16 = FMA(KP414213562, T15, T10);
Chris@10 234 T1h = FNMS(KP414213562, T1g, T1b);
Chris@10 235 T1s = FMA(KP414213562, T1b, T1g);
Chris@10 236 T1r = FMA(KP707106781, T1o, T1l);
Chris@10 237 T1p = FNMS(KP707106781, T1o, T1l);
Chris@10 238 T1q = T16 + T1h;
Chris@10 239 T1i = T16 - T1h;
Chris@10 240 }
Chris@10 241 {
Chris@10 242 E T1w, T1u, T1C, T1F;
/* Outputs 3, 7, 11 and 15: terms pre-scaled with KP414213562 are
 * combined with a final KP923879532 multiply. */
Chris@10 243 io[WS(os, 15)] = FMA(KP923879532, T1q, T1p);
Chris@10 244 io[WS(os, 7)] = FNMS(KP923879532, T1q, T1p);
Chris@10 245 ro[WS(os, 3)] = FMA(KP923879532, T1i, TV);
Chris@10 246 ro[WS(os, 11)] = FNMS(KP923879532, T1i, TV);
Chris@10 247 T1w = T1t + T1s;
Chris@10 248 T1u = T1s - T1t;
Chris@10 249 T1z = FMA(KP707106781, T1y, T1x);
Chris@10 250 T1L = FNMS(KP707106781, T1y, T1x);
Chris@10 251 ro[WS(os, 15)] = FMA(KP923879532, T1w, T1v);
Chris@10 252 ro[WS(os, 7)] = FNMS(KP923879532, T1w, T1v);
Chris@10 253 io[WS(os, 3)] = FMA(KP923879532, T1u, T1r);
Chris@10 254 io[WS(os, 11)] = FNMS(KP923879532, T1u, T1r);
Chris@10 255 T1M = FNMS(KP414213562, T1A, T1B);
Chris@10 256 T1C = FMA(KP414213562, T1B, T1A);
Chris@10 257 T1F = FNMS(KP414213562, T1E, T1D);
Chris@10 258 T1N = FMA(KP414213562, T1D, T1E);
Chris@10 259 T1P = FMA(KP707106781, T1I, T1H);
Chris@10 260 T1J = FNMS(KP707106781, T1I, T1H);
Chris@10 261 T1K = T1F - T1C;
Chris@10 262 T1G = T1C + T1F;
Chris@10 263 }
Chris@10 264 }
Chris@10 265 }
Chris@10 266 }
Chris@10 267 }
/* Outputs 1, 5, 9 and 13, built from the loop-scope temporaries. */
Chris@10 268 io[WS(os, 5)] = FMA(KP923879532, T1K, T1J);
Chris@10 269 io[WS(os, 13)] = FNMS(KP923879532, T1K, T1J);
Chris@10 270 ro[WS(os, 1)] = FMA(KP923879532, T1G, T1z);
Chris@10 271 ro[WS(os, 9)] = FNMS(KP923879532, T1G, T1z);
Chris@10 272 T1O = T1M - T1N;
Chris@10 273 T1Q = T1M + T1N;
Chris@10 274 io[WS(os, 1)] = FMA(KP923879532, T1Q, T1P);
Chris@10 275 io[WS(os, 9)] = FNMS(KP923879532, T1Q, T1P);
Chris@10 276 ro[WS(os, 5)] = FMA(KP923879532, T1O, T1L);
Chris@10 277 ro[WS(os, 13)] = FNMS(KP923879532, T1O, T1L);
Chris@10 278 }
Chris@10 279 }
Chris@10 280 }
Chris@10 281
/*
 * Descriptor handed to X(kdft_register) together with n1_16 (HAVE_FMA
 * build). 16 and "n1_16" match the generator's -n and -name options, and
 * the first three numbers in braces equal the counts in the generator
 * comment above (104 additions, 0 multiplications, 40 fused multiply/add).
 * NOTE(review): the meaning of the fourth number and of the trailing
 * zeros is not visible in this file; check the kdft_desc declaration.
 */
Chris@10 282 static const kdft_desc desc = { 16, "n1_16", {104, 0, 40, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 283
/*
 * Registration entry point for the HAVE_FMA build: passes the n1_16
 * kernel and its descriptor to X(kdft_register) for planner p.
 * NOTE(review): X() is presumably the library's name-decorating macro;
 * confirm in the headers.
 */
Chris@10 284 void X(codelet_n1_16) (planner *p) {
Chris@10 285 X(kdft_register) (p, n1_16, &desc);
Chris@10 286 }
Chris@10 287
Chris@10 288 #else /* HAVE_FMA */
Chris@10 289
Chris@10 290 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include n.h */
Chris@10 291
Chris@10 292 /*
Chris@10 293 * This function contains 144 FP additions, 24 FP multiplications,
Chris@10 294 * (or, 136 additions, 16 multiplications, 8 fused multiply/add),
Chris@10 295 * 50 stack variables, 3 constants, and 64 memory accesses
Chris@10 296 */
Chris@10 297 #include "n.h"
Chris@10 298
/*
 * n1_16 (build without HAVE_FMA): machine-generated kernel for n = 16,
 * produced by the gen_notw command line quoted above. The file header says
 * it is generated and must not be edited by hand, so only comments are
 * added.
 *
 * Per loop iteration the body reads ri[WS(is, k)] and ii[WS(is, k)] for
 * k = 0..15 and stores ro[WS(os, k)] and io[WS(os, k)] for k = 0..15.
 * ro[0] and io[0] are the plain sums of the sixteen ri and ii values,
 * which is consistent with a 16-point DFT over (ri, ii) as real and
 * imaginary parts -- presumably the forward transform; confirm against
 * the genfft documentation.
 *
 * ri, ii    input arrays (const, only read)
 * ro, io    output arrays (only written)
 * is, os    strides handed to WS() to form input and output offsets
 * v         number of loop iterations
 * ivs, ovs  amounts added to the input and output pointers after each
 *           iteration
 *
 * Most products here are explicit multiplications; FMA and FNMS are used
 * only for the eight two-term rotations. NOTE(review): FMA(a, b, c) is
 * presumably a * b + c and FNMS(a, b, c) presumably c - a * b; confirm in
 * the headers.
 */
Chris@10 299 static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 300 {
/* Numerically: sin(pi/8), cos(pi/8), and sqrt(2)/2. */
Chris@10 301 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 302 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 303 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 304 {
Chris@10 305 INT i;
/* Count i down from v; each pass advances ri/ii by ivs and ro/io by ovs. */
Chris@10 306 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
/* Intermediate values shared between the four input groups below and the
 * output groups that follow them. */
Chris@10 307 E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
Chris@10 308 E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
Chris@10 309 E T1U, T1A;
/* Input group 0, 8, 4, 12. */
Chris@10 310 {
Chris@10 311 E T3, TL, Ty, T1k, T6, T1j, TB, TM;
Chris@10 312 {
Chris@10 313 E T1, T2, Tw, Tx;
Chris@10 314 T1 = ri[0];
Chris@10 315 T2 = ri[WS(is, 8)];
Chris@10 316 T3 = T1 + T2;
Chris@10 317 TL = T1 - T2;
Chris@10 318 Tw = ii[0];
Chris@10 319 Tx = ii[WS(is, 8)];
Chris@10 320 Ty = Tw + Tx;
Chris@10 321 T1k = Tw - Tx;
Chris@10 322 }
Chris@10 323 {
Chris@10 324 E T4, T5, Tz, TA;
Chris@10 325 T4 = ri[WS(is, 4)];
Chris@10 326 T5 = ri[WS(is, 12)];
Chris@10 327 T6 = T4 + T5;
Chris@10 328 T1j = T4 - T5;
Chris@10 329 Tz = ii[WS(is, 4)];
Chris@10 330 TA = ii[WS(is, 12)];
Chris@10 331 TB = Tz + TA;
Chris@10 332 TM = Tz - TA;
Chris@10 333 }
Chris@10 334 T7 = T3 + T6;
Chris@10 335 T1R = T3 - T6;
Chris@10 336 T25 = Ty - TB;
Chris@10 337 TC = Ty + TB;
Chris@10 338 TN = TL - TM;
Chris@10 339 T1x = TL + TM;
Chris@10 340 T1H = T1k - T1j;
Chris@10 341 T1l = T1j + T1k;
Chris@10 342 }
/* Input group 15, 7, 3, 11. */
Chris@10 343 {
Chris@10 344 E Tp, T17, T1f, T20, Ts, T1c, T1a, T21;
Chris@10 345 {
Chris@10 346 E Tn, To, T1d, T1e;
Chris@10 347 Tn = ri[WS(is, 15)];
Chris@10 348 To = ri[WS(is, 7)];
Chris@10 349 Tp = Tn + To;
Chris@10 350 T17 = Tn - To;
Chris@10 351 T1d = ii[WS(is, 15)];
Chris@10 352 T1e = ii[WS(is, 7)];
Chris@10 353 T1f = T1d - T1e;
Chris@10 354 T20 = T1d + T1e;
Chris@10 355 }
Chris@10 356 {
Chris@10 357 E Tq, Tr, T18, T19;
Chris@10 358 Tq = ri[WS(is, 3)];
Chris@10 359 Tr = ri[WS(is, 11)];
Chris@10 360 Ts = Tq + Tr;
Chris@10 361 T1c = Tq - Tr;
Chris@10 362 T18 = ii[WS(is, 3)];
Chris@10 363 T19 = ii[WS(is, 11)];
Chris@10 364 T1a = T18 - T19;
Chris@10 365 T21 = T18 + T19;
Chris@10 366 }
Chris@10 367 Tt = Tp + Ts;
Chris@10 368 T22 = T20 - T21;
Chris@10 369 T2h = T20 + T21;
Chris@10 370 T1b = T17 - T1a;
Chris@10 371 T1g = T1c + T1f;
Chris@10 372 T1E = T1f - T1c;
Chris@10 373 T1Z = Tp - Ts;
Chris@10 374 T1D = T17 + T1a;
Chris@10 375 }
/* Input group 2, 10, 14, 6. */
Chris@10 376 {
Chris@10 377 E Ta, TP, TF, TO, Td, TR, TI, TS;
Chris@10 378 {
Chris@10 379 E T8, T9, TD, TE;
Chris@10 380 T8 = ri[WS(is, 2)];
Chris@10 381 T9 = ri[WS(is, 10)];
Chris@10 382 Ta = T8 + T9;
Chris@10 383 TP = T8 - T9;
Chris@10 384 TD = ii[WS(is, 2)];
Chris@10 385 TE = ii[WS(is, 10)];
Chris@10 386 TF = TD + TE;
Chris@10 387 TO = TD - TE;
Chris@10 388 }
Chris@10 389 {
Chris@10 390 E Tb, Tc, TG, TH;
Chris@10 391 Tb = ri[WS(is, 14)];
Chris@10 392 Tc = ri[WS(is, 6)];
Chris@10 393 Td = Tb + Tc;
Chris@10 394 TR = Tb - Tc;
Chris@10 395 TG = ii[WS(is, 14)];
Chris@10 396 TH = ii[WS(is, 6)];
Chris@10 397 TI = TG + TH;
Chris@10 398 TS = TG - TH;
Chris@10 399 }
Chris@10 400 Te = Ta + Td;
Chris@10 401 T1S = TF - TI;
Chris@10 402 T26 = Td - Ta;
Chris@10 403 TJ = TF + TI;
Chris@10 404 TQ = TO - TP;
Chris@10 405 T1m = TR - TS;
Chris@10 406 T1n = TP + TO;
Chris@10 407 TT = TR + TS;
Chris@10 408 }
/* Input group 1, 9, 5, 13. */
Chris@10 409 {
Chris@10 410 E Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
Chris@10 411 {
Chris@10 412 E Tg, Th, TX, TY;
Chris@10 413 Tg = ri[WS(is, 1)];
Chris@10 414 Th = ri[WS(is, 9)];
Chris@10 415 Ti = Tg + Th;
Chris@10 416 T11 = Tg - Th;
Chris@10 417 TX = ii[WS(is, 1)];
Chris@10 418 TY = ii[WS(is, 9)];
Chris@10 419 TZ = TX - TY;
Chris@10 420 T1V = TX + TY;
Chris@10 421 }
Chris@10 422 {
Chris@10 423 E Tj, Tk, T12, T13;
Chris@10 424 Tj = ri[WS(is, 5)];
Chris@10 425 Tk = ri[WS(is, 13)];
Chris@10 426 Tl = Tj + Tk;
Chris@10 427 TW = Tj - Tk;
Chris@10 428 T12 = ii[WS(is, 5)];
Chris@10 429 T13 = ii[WS(is, 13)];
Chris@10 430 T14 = T12 - T13;
Chris@10 431 T1W = T12 + T13;
Chris@10 432 }
Chris@10 433 Tm = Ti + Tl;
Chris@10 434 T1X = T1V - T1W;
Chris@10 435 T2g = T1V + T1W;
Chris@10 436 T10 = TW + TZ;
Chris@10 437 T15 = T11 - T14;
Chris@10 438 T1B = T11 + T14;
Chris@10 439 T1U = Ti - Tl;
Chris@10 440 T1A = TZ - TW;
Chris@10 441 }
/* Outputs 0 and 8: Tf/T2j hold the even-indexed input sums and Tu/T2k the
 * odd-indexed ones, so index 0 is the total and index 8 is even minus odd. */
Chris@10 442 {
Chris@10 443 E Tf, Tu, T2j, T2k;
Chris@10 444 Tf = T7 + Te;
Chris@10 445 Tu = Tm + Tt;
Chris@10 446 ro[WS(os, 8)] = Tf - Tu;
Chris@10 447 ro[0] = Tf + Tu;
Chris@10 448 T2j = TC + TJ;
Chris@10 449 T2k = T2g + T2h;
Chris@10 450 io[WS(os, 8)] = T2j - T2k;
Chris@10 451 io[0] = T2j + T2k;
Chris@10 452 }
/* Outputs 4 and 12 need only additions and subtractions. */
Chris@10 453 {
Chris@10 454 E Tv, TK, T2f, T2i;
Chris@10 455 Tv = Tt - Tm;
Chris@10 456 TK = TC - TJ;
Chris@10 457 io[WS(os, 4)] = Tv + TK;
Chris@10 458 io[WS(os, 12)] = TK - Tv;
Chris@10 459 T2f = T7 - Te;
Chris@10 460 T2i = T2g - T2h;
Chris@10 461 ro[WS(os, 12)] = T2f - T2i;
Chris@10 462 ro[WS(os, 4)] = T2f + T2i;
Chris@10 463 }
/* ro[2], ro[10], io[6], io[14]: one KP707106781 scaling each. */
Chris@10 464 {
Chris@10 465 E T1T, T27, T24, T28, T1Y, T23;
Chris@10 466 T1T = T1R + T1S;
Chris@10 467 T27 = T25 - T26;
Chris@10 468 T1Y = T1U + T1X;
Chris@10 469 T23 = T1Z - T22;
Chris@10 470 T24 = KP707106781 * (T1Y + T23);
Chris@10 471 T28 = KP707106781 * (T23 - T1Y);
Chris@10 472 ro[WS(os, 10)] = T1T - T24;
Chris@10 473 io[WS(os, 6)] = T27 + T28;
Chris@10 474 ro[WS(os, 2)] = T1T + T24;
Chris@10 475 io[WS(os, 14)] = T27 - T28;
Chris@10 476 }
/* ro[6], ro[14], io[2], io[10]: one KP707106781 scaling each. */
Chris@10 477 {
Chris@10 478 E T29, T2d, T2c, T2e, T2a, T2b;
Chris@10 479 T29 = T1R - T1S;
Chris@10 480 T2d = T26 + T25;
Chris@10 481 T2a = T1X - T1U;
Chris@10 482 T2b = T1Z + T22;
Chris@10 483 T2c = KP707106781 * (T2a - T2b);
Chris@10 484 T2e = KP707106781 * (T2a + T2b);
Chris@10 485 ro[WS(os, 14)] = T29 - T2c;
Chris@10 486 io[WS(os, 2)] = T2d + T2e;
Chris@10 487 ro[WS(os, 6)] = T29 + T2c;
Chris@10 488 io[WS(os, 10)] = T2d - T2e;
Chris@10 489 }
/* Outputs 3, 7, 11 and 15: two-term combinations weighted by
 * KP923879532 and KP382683432. */
Chris@10 490 {
Chris@10 491 E TV, T1r, T1p, T1v, T1i, T1q, T1u, T1w, TU, T1o;
Chris@10 492 TU = KP707106781 * (TQ - TT);
Chris@10 493 TV = TN + TU;
Chris@10 494 T1r = TN - TU;
Chris@10 495 T1o = KP707106781 * (T1m - T1n);
Chris@10 496 T1p = T1l - T1o;
Chris@10 497 T1v = T1l + T1o;
Chris@10 498 {
Chris@10 499 E T16, T1h, T1s, T1t;
Chris@10 500 T16 = FMA(KP923879532, T10, KP382683432 * T15);
Chris@10 501 T1h = FNMS(KP923879532, T1g, KP382683432 * T1b);
Chris@10 502 T1i = T16 + T1h;
Chris@10 503 T1q = T1h - T16;
Chris@10 504 T1s = FNMS(KP923879532, T15, KP382683432 * T10);
Chris@10 505 T1t = FMA(KP382683432, T1g, KP923879532 * T1b);
Chris@10 506 T1u = T1s - T1t;
Chris@10 507 T1w = T1s + T1t;
Chris@10 508 }
Chris@10 509 ro[WS(os, 11)] = TV - T1i;
Chris@10 510 io[WS(os, 11)] = T1v - T1w;
Chris@10 511 ro[WS(os, 3)] = TV + T1i;
Chris@10 512 io[WS(os, 3)] = T1v + T1w;
Chris@10 513 io[WS(os, 15)] = T1p - T1q;
Chris@10 514 ro[WS(os, 15)] = T1r - T1u;
Chris@10 515 io[WS(os, 7)] = T1p + T1q;
Chris@10 516 ro[WS(os, 7)] = T1r + T1u;
Chris@10 517 }
/* Outputs 1, 5, 9 and 13: same pattern as the previous block, applied to
 * the complementary set of intermediates. */
Chris@10 518 {
Chris@10 519 E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
Chris@10 520 T1y = KP707106781 * (T1n + T1m);
Chris@10 521 T1z = T1x + T1y;
Chris@10 522 T1L = T1x - T1y;
Chris@10 523 T1I = KP707106781 * (TQ + TT);
Chris@10 524 T1J = T1H - T1I;
Chris@10 525 T1P = T1H + T1I;
Chris@10 526 {
Chris@10 527 E T1C, T1F, T1M, T1N;
Chris@10 528 T1C = FMA(KP382683432, T1A, KP923879532 * T1B);
Chris@10 529 T1F = FNMS(KP382683432, T1E, KP923879532 * T1D);
Chris@10 530 T1G = T1C + T1F;
Chris@10 531 T1K = T1F - T1C;
Chris@10 532 T1M = FNMS(KP382683432, T1B, KP923879532 * T1A);
Chris@10 533 T1N = FMA(KP923879532, T1E, KP382683432 * T1D);
Chris@10 534 T1O = T1M - T1N;
Chris@10 535 T1Q = T1M + T1N;
Chris@10 536 }
Chris@10 537 ro[WS(os, 9)] = T1z - T1G;
Chris@10 538 io[WS(os, 9)] = T1P - T1Q;
Chris@10 539 ro[WS(os, 1)] = T1z + T1G;
Chris@10 540 io[WS(os, 1)] = T1P + T1Q;
Chris@10 541 io[WS(os, 13)] = T1J - T1K;
Chris@10 542 ro[WS(os, 13)] = T1L - T1O;
Chris@10 543 io[WS(os, 5)] = T1J + T1K;
Chris@10 544 ro[WS(os, 5)] = T1L + T1O;
Chris@10 545 }
Chris@10 546 }
Chris@10 547 }
Chris@10 548 }
Chris@10 549
/*
 * Descriptor handed to X(kdft_register) together with n1_16 (build
 * without HAVE_FMA). 16 and "n1_16" match the generator's -n and -name
 * options, and the first three numbers in braces equal the counts in the
 * generator comment above (136 additions, 16 multiplications, 8 fused
 * multiply/add).
 * NOTE(review): the meaning of the fourth number and of the trailing
 * zeros is not visible in this file; check the kdft_desc declaration.
 */
Chris@10 550 static const kdft_desc desc = { 16, "n1_16", {136, 16, 8, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 551
/*
 * Registration entry point for the build without HAVE_FMA: passes the
 * n1_16 kernel and its descriptor to X(kdft_register) for planner p.
 * NOTE(review): X() is presumably the library's name-decorating macro;
 * confirm in the headers.
 */
Chris@10 552 void X(codelet_n1_16) (planner *p) {
Chris@10 553 X(kdft_register) (p, n1_16, &desc);
Chris@10 554 }
Chris@10 555
Chris@10 556 #endif /* HAVE_FMA */