annotate src/fftw-3.3.8/dft/scalar/codelets/n1_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:10 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include dft/scalar/n.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 96 FP additions, 24 FP multiplications,
Chris@82 32 * (or, 72 additions, 0 multiplications, 24 fused multiply/add),
Chris@82 33 * 43 stack variables, 2 constants, and 48 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/n.h"
Chris@82 36
/*
 * n1_12, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined). Per the "Generated by" line above,
 * this is genfft gen_notw output for n = 12.
 *
 * ri, ii   : read-only inputs, read at ri[WS(is, k)] and ii[WS(is, k)] for
 *            k = 0..11 (presumably real and imaginary parts -- confirm
 *            against dft/codelet-dft.h).
 * ro, io   : outputs, written at ro[WS(os, k)] and io[WS(os, k)], k = 0..11.
 * is, os   : strides handed to WS() for input and output indexing.
 * v        : loop trip count; the body runs while the counter is > 0.
 * ivs, ovs : added to the input / output pointers after every iteration.
 *
 * Each iteration does 24 loads and 24 stores. DK, WS, FMA, FNMS, E, R, INT,
 * stride and MAKE_VOLATILE_STRIDE are declared in project headers not shown
 * here. Comparing with the non-FMA variant later in this file suggests
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- TODO confirm.
 */
Chris@82 37 static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
/* The two constants used below: 0.866025403... (numerically sqrt(3)/2) and 0.5. */
Chris@82 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 41 {
Chris@82 42 INT i;
/* Counts i down from v; the increment clause steps all four data pointers and
 * invokes MAKE_VOLATILE_STRIDE on is and os (macro defined elsewhere; its
 * purpose is not visible from this file). */
Chris@82 43 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
/* Values that survive the load stage and feed the output stage. */
Chris@82 44 E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1d, TG;
Chris@82 45 E TJ, T1u, T1c, Tl, T1i, TL, TO, T1v, T1h;
/* Load stage: eight blocks, each reading three inputs and producing the
 * three-term sum, a 0.5-weighted FNMS combination, and a two-term difference.
 * This block: ri at indices 0, 4, 8. */
Chris@82 46 {
Chris@82 47 E T1, T2, T3, T4;
Chris@82 48 T1 = ri[0];
Chris@82 49 T2 = ri[WS(is, 4)];
Chris@82 50 T3 = ri[WS(is, 8)];
Chris@82 51 T4 = T2 + T3;
Chris@82 52 T5 = T1 + T4;
Chris@82 53 TR = FNMS(KP500000000, T4, T1);
Chris@82 54 TA = T3 - T2;
Chris@82 55 }
/* ii at indices 0, 4, 8. */
Chris@82 56 {
Chris@82 57 E To, Tp, Tq, Tr;
Chris@82 58 To = ii[0];
Chris@82 59 Tp = ii[WS(is, 4)];
Chris@82 60 Tq = ii[WS(is, 8)];
Chris@82 61 Tr = Tp + Tq;
Chris@82 62 Ts = To + Tr;
Chris@82 63 TS = Tp - Tq;
Chris@82 64 Tz = FNMS(KP500000000, Tr, To);
Chris@82 65 }
/* ri at indices 6, 10, 2. */
Chris@82 66 {
Chris@82 67 E T6, T7, T8, T9;
Chris@82 68 T6 = ri[WS(is, 6)];
Chris@82 69 T7 = ri[WS(is, 10)];
Chris@82 70 T8 = ri[WS(is, 2)];
Chris@82 71 T9 = T7 + T8;
Chris@82 72 Ta = T6 + T9;
Chris@82 73 TU = FNMS(KP500000000, T9, T6);
Chris@82 74 TD = T8 - T7;
Chris@82 75 }
/* ii at indices 6, 10, 2. */
Chris@82 76 {
Chris@82 77 E Tt, Tu, Tv, Tw;
Chris@82 78 Tt = ii[WS(is, 6)];
Chris@82 79 Tu = ii[WS(is, 10)];
Chris@82 80 Tv = ii[WS(is, 2)];
Chris@82 81 Tw = Tu + Tv;
Chris@82 82 Tx = Tt + Tw;
Chris@82 83 TV = Tu - Tv;
Chris@82 84 TC = FNMS(KP500000000, Tw, Tt);
Chris@82 85 }
/* ri at indices 3, 7, 11. */
Chris@82 86 {
Chris@82 87 E Tc, Td, Te, Tf;
Chris@82 88 Tc = ri[WS(is, 3)];
Chris@82 89 Td = ri[WS(is, 7)];
Chris@82 90 Te = ri[WS(is, 11)];
Chris@82 91 Tf = Td + Te;
Chris@82 92 Tg = Tc + Tf;
Chris@82 93 T1d = Te - Td;
Chris@82 94 TG = FNMS(KP500000000, Tf, Tc);
Chris@82 95 }
/* ii at indices 3, 7, 11. */
Chris@82 96 {
Chris@82 97 E T1a, TH, TI, T1b;
Chris@82 98 T1a = ii[WS(is, 3)];
Chris@82 99 TH = ii[WS(is, 7)];
Chris@82 100 TI = ii[WS(is, 11)];
Chris@82 101 T1b = TH + TI;
Chris@82 102 TJ = TH - TI;
Chris@82 103 T1u = T1a + T1b;
Chris@82 104 T1c = FNMS(KP500000000, T1b, T1a);
Chris@82 105 }
/* ri at indices 9, 1, 5. */
Chris@82 106 {
Chris@82 107 E Th, Ti, Tj, Tk;
Chris@82 108 Th = ri[WS(is, 9)];
Chris@82 109 Ti = ri[WS(is, 1)];
Chris@82 110 Tj = ri[WS(is, 5)];
Chris@82 111 Tk = Ti + Tj;
Chris@82 112 Tl = Th + Tk;
Chris@82 113 T1i = Tj - Ti;
Chris@82 114 TL = FNMS(KP500000000, Tk, Th);
Chris@82 115 }
/* ii at indices 9, 1, 5. */
Chris@82 116 {
Chris@82 117 E T1f, TM, TN, T1g;
Chris@82 118 T1f = ii[WS(is, 9)];
Chris@82 119 TM = ii[WS(is, 1)];
Chris@82 120 TN = ii[WS(is, 5)];
Chris@82 121 T1g = TM + TN;
Chris@82 122 TO = TM - TN;
Chris@82 123 T1v = T1f + T1g;
Chris@82 124 T1h = FNMS(KP500000000, T1g, T1f);
Chris@82 125 }
/* Output stage. Outputs 0, 3, 6 and 9 are built only from the eight
 * three-term sums (T5, Ta, Tg, Tl for ri; Ts, Tx, T1u, T1v for ii). */
Chris@82 126 {
Chris@82 127 E Tb, Tm, T1t, T1w;
Chris@82 128 Tb = T5 + Ta;
Chris@82 129 Tm = Tg + Tl;
Chris@82 130 ro[WS(os, 6)] = Tb - Tm;
Chris@82 131 ro[0] = Tb + Tm;
Chris@82 132 {
Chris@82 133 E T1x, T1y, Tn, Ty;
Chris@82 134 T1x = Ts + Tx;
Chris@82 135 T1y = T1u + T1v;
Chris@82 136 io[WS(os, 6)] = T1x - T1y;
Chris@82 137 io[0] = T1x + T1y;
/* io[3] and io[9] mix a difference of ri-derived sums (Tn) with a
 * difference of ii-derived sums (Ty). */
Chris@82 138 Tn = Tg - Tl;
Chris@82 139 Ty = Ts - Tx;
Chris@82 140 io[WS(os, 3)] = Tn + Ty;
Chris@82 141 io[WS(os, 9)] = Ty - Tn;
Chris@82 142 }
/* ro[3] and ro[9] likewise mix an ri-derived difference (T1t) with an
 * ii-derived difference (T1w). */
Chris@82 143 T1t = T5 - Ta;
Chris@82 144 T1w = T1u - T1v;
Chris@82 145 ro[WS(os, 3)] = T1t - T1w;
Chris@82 146 ro[WS(os, 9)] = T1t + T1w;
/* Outputs 1, 4, 7 and 10: every operand is an FMA of KP866025403 with one
 * of the load-stage differences and its matching 0.5-weighted term. */
Chris@82 147 {
Chris@82 148 E T11, T1l, T1k, T1m, T14, T18, T17, T19;
Chris@82 149 {
Chris@82 150 E TZ, T10, T1e, T1j;
Chris@82 151 TZ = FMA(KP866025403, TA, Tz);
Chris@82 152 T10 = FMA(KP866025403, TD, TC);
Chris@82 153 T11 = TZ - T10;
Chris@82 154 T1l = TZ + T10;
Chris@82 155 T1e = FMA(KP866025403, T1d, T1c);
Chris@82 156 T1j = FMA(KP866025403, T1i, T1h);
Chris@82 157 T1k = T1e - T1j;
Chris@82 158 T1m = T1e + T1j;
Chris@82 159 }
Chris@82 160 {
Chris@82 161 E T12, T13, T15, T16;
Chris@82 162 T12 = FMA(KP866025403, TJ, TG);
Chris@82 163 T13 = FMA(KP866025403, TO, TL);
Chris@82 164 T14 = T12 - T13;
Chris@82 165 T18 = T12 + T13;
Chris@82 166 T15 = FMA(KP866025403, TS, TR);
Chris@82 167 T16 = FMA(KP866025403, TV, TU);
Chris@82 168 T17 = T15 + T16;
Chris@82 169 T19 = T15 - T16;
Chris@82 170 }
Chris@82 171 io[WS(os, 1)] = T11 - T14;
Chris@82 172 ro[WS(os, 1)] = T19 + T1k;
Chris@82 173 io[WS(os, 7)] = T11 + T14;
Chris@82 174 ro[WS(os, 7)] = T19 - T1k;
Chris@82 175 ro[WS(os, 10)] = T17 - T18;
Chris@82 176 io[WS(os, 10)] = T1l - T1m;
Chris@82 177 ro[WS(os, 4)] = T17 + T18;
Chris@82 178 io[WS(os, 4)] = T1l + T1m;
Chris@82 179 }
/* Outputs 2, 5, 8 and 11: the same pattern as the previous block, but with
 * FNMS in place of FMA on the same operand pairs. */
Chris@82 180 {
Chris@82 181 E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
Chris@82 182 {
Chris@82 183 E TB, TE, T1o, T1p;
Chris@82 184 TB = FNMS(KP866025403, TA, Tz);
Chris@82 185 TE = FNMS(KP866025403, TD, TC);
Chris@82 186 TF = TB - TE;
Chris@82 187 T1r = TB + TE;
Chris@82 188 T1o = FNMS(KP866025403, T1d, T1c);
Chris@82 189 T1p = FNMS(KP866025403, T1i, T1h);
Chris@82 190 T1q = T1o - T1p;
Chris@82 191 T1s = T1o + T1p;
Chris@82 192 }
Chris@82 193 {
Chris@82 194 E TK, TP, TT, TW;
Chris@82 195 TK = FNMS(KP866025403, TJ, TG);
Chris@82 196 TP = FNMS(KP866025403, TO, TL);
Chris@82 197 TQ = TK - TP;
Chris@82 198 TY = TK + TP;
Chris@82 199 TT = FNMS(KP866025403, TS, TR);
Chris@82 200 TW = FNMS(KP866025403, TV, TU);
Chris@82 201 TX = TT + TW;
Chris@82 202 T1n = TT - TW;
Chris@82 203 }
Chris@82 204 io[WS(os, 5)] = TF - TQ;
Chris@82 205 ro[WS(os, 5)] = T1n + T1q;
Chris@82 206 io[WS(os, 11)] = TF + TQ;
Chris@82 207 ro[WS(os, 11)] = T1n - T1q;
Chris@82 208 ro[WS(os, 2)] = TX - TY;
Chris@82 209 io[WS(os, 2)] = T1r - T1s;
Chris@82 210 ro[WS(os, 8)] = TX + TY;
Chris@82 211 io[WS(os, 8)] = T1r + T1s;
Chris@82 212 }
Chris@82 213 }
Chris@82 214 }
Chris@82 215 }
Chris@82 216 }
Chris@82 217
/* Descriptor handed to kdft_register for the FMA variant. The initializer is
 * positional: 12, the name "n1_12", then {72, 0, 24, 0}, whose first three
 * entries match the "72 additions, 0 multiplications, 24 fused multiply/add"
 * counts in the header comment above. GENUS and the meaning of the remaining
 * fields come from project headers not shown here -- confirm against the
 * kdft_desc declaration. */
Chris@82 218 static const kdft_desc desc = { 12, "n1_12", {72, 0, 24, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 219
/* Registration entry point (FMA variant): passes the planner p, the n1_12
 * kernel and the address of its descriptor to X(kdft_register). X() is a
 * macro from project headers (presumably a name-prefixing wrapper -- confirm). */
Chris@82 220 void X(codelet_n1_12) (planner *p) {
Chris@82 221 X(kdft_register) (p, n1_12, &desc);
Chris@82 222 }
Chris@82 223
Chris@82 224 #else
Chris@82 225
Chris@82 226 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include dft/scalar/n.h */
Chris@82 227
Chris@82 228 /*
Chris@82 229 * This function contains 96 FP additions, 16 FP multiplications,
Chris@82 230 * (or, 88 additions, 8 multiplications, 8 fused multiply/add),
Chris@82 231 * 43 stack variables, 2 constants, and 48 memory accesses
Chris@82 232 */
Chris@82 233 #include "dft/scalar/n.h"
Chris@82 234
/*
 * n1_12, non-FMA variant (compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined). Per the "Generated by" line above,
 * this is genfft gen_notw output for n = 12, generated without -fma.
 *
 * ri, ii   : read-only inputs, read at ri[WS(is, k)] and ii[WS(is, k)] for
 *            k = 0..11 (presumably real and imaginary parts -- confirm
 *            against dft/codelet-dft.h).
 * ro, io   : outputs, written at ro[WS(os, k)] and io[WS(os, k)], k = 0..11.
 * is, os   : strides handed to WS() for input and output indexing.
 * v        : loop trip count; the body runs while the counter is > 0.
 * ivs, ovs : added to the input / output pointers after every iteration.
 *
 * Unlike the FMA variant earlier in this file, the two-term differences are
 * multiplied by KP866025403 as soon as they are formed, so the output stage
 * uses plain additions and subtractions. DK, WS, FNMS, E, R, INT, stride and
 * MAKE_VOLATILE_STRIDE are declared in project headers not shown here;
 * FNMS(a, b, c) is presumably c - a*b -- TODO confirm.
 */
Chris@82 235 static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 236 {
/* The two constants used below: 0.866025403... (numerically sqrt(3)/2) and 0.5. */
Chris@82 237 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 238 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 239 {
Chris@82 240 INT i;
/* Counts i down from v; the increment clause steps all four data pointers and
 * invokes MAKE_VOLATILE_STRIDE on is and os (macro defined elsewhere; its
 * purpose is not visible from this file). */
Chris@82 241 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
/* Values that survive the load stage and feed the output stage. */
Chris@82 242 E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1a, TG;
Chris@82 243 E TJ, T1u, T1d, Tl, T1f, TL, TO, T1v, T1i;
/* Load stage: eight blocks, each reading three inputs and producing the
 * three-term sum, a 0.5-weighted FNMS combination, and a two-term difference
 * already multiplied by KP866025403. This block: ri at indices 0, 4, 8. */
Chris@82 244 {
Chris@82 245 E T1, T2, T3, T4;
Chris@82 246 T1 = ri[0];
Chris@82 247 T2 = ri[WS(is, 4)];
Chris@82 248 T3 = ri[WS(is, 8)];
Chris@82 249 T4 = T2 + T3;
Chris@82 250 T5 = T1 + T4;
Chris@82 251 TR = FNMS(KP500000000, T4, T1);
Chris@82 252 TA = KP866025403 * (T3 - T2);
Chris@82 253 }
/* ii at indices 0, 4, 8. */
Chris@82 254 {
Chris@82 255 E To, Tp, Tq, Tr;
Chris@82 256 To = ii[0];
Chris@82 257 Tp = ii[WS(is, 4)];
Chris@82 258 Tq = ii[WS(is, 8)];
Chris@82 259 Tr = Tp + Tq;
Chris@82 260 Ts = To + Tr;
Chris@82 261 TS = KP866025403 * (Tp - Tq);
Chris@82 262 Tz = FNMS(KP500000000, Tr, To);
Chris@82 263 }
/* ri at indices 6, 10, 2. */
Chris@82 264 {
Chris@82 265 E T6, T7, T8, T9;
Chris@82 266 T6 = ri[WS(is, 6)];
Chris@82 267 T7 = ri[WS(is, 10)];
Chris@82 268 T8 = ri[WS(is, 2)];
Chris@82 269 T9 = T7 + T8;
Chris@82 270 Ta = T6 + T9;
Chris@82 271 TU = FNMS(KP500000000, T9, T6);
Chris@82 272 TD = KP866025403 * (T8 - T7);
Chris@82 273 }
/* ii at indices 6, 10, 2. */
Chris@82 274 {
Chris@82 275 E Tt, Tu, Tv, Tw;
Chris@82 276 Tt = ii[WS(is, 6)];
Chris@82 277 Tu = ii[WS(is, 10)];
Chris@82 278 Tv = ii[WS(is, 2)];
Chris@82 279 Tw = Tu + Tv;
Chris@82 280 Tx = Tt + Tw;
Chris@82 281 TV = KP866025403 * (Tu - Tv);
Chris@82 282 TC = FNMS(KP500000000, Tw, Tt);
Chris@82 283 }
/* ri at indices 3, 7, 11. */
Chris@82 284 {
Chris@82 285 E Tc, Td, Te, Tf;
Chris@82 286 Tc = ri[WS(is, 3)];
Chris@82 287 Td = ri[WS(is, 7)];
Chris@82 288 Te = ri[WS(is, 11)];
Chris@82 289 Tf = Td + Te;
Chris@82 290 Tg = Tc + Tf;
Chris@82 291 T1a = KP866025403 * (Te - Td);
Chris@82 292 TG = FNMS(KP500000000, Tf, Tc);
Chris@82 293 }
/* ii at indices 3, 7, 11. */
Chris@82 294 {
Chris@82 295 E T1b, TH, TI, T1c;
Chris@82 296 T1b = ii[WS(is, 3)];
Chris@82 297 TH = ii[WS(is, 7)];
Chris@82 298 TI = ii[WS(is, 11)];
Chris@82 299 T1c = TH + TI;
Chris@82 300 TJ = KP866025403 * (TH - TI);
Chris@82 301 T1u = T1b + T1c;
Chris@82 302 T1d = FNMS(KP500000000, T1c, T1b);
Chris@82 303 }
/* ri at indices 9, 1, 5. */
Chris@82 304 {
Chris@82 305 E Th, Ti, Tj, Tk;
Chris@82 306 Th = ri[WS(is, 9)];
Chris@82 307 Ti = ri[WS(is, 1)];
Chris@82 308 Tj = ri[WS(is, 5)];
Chris@82 309 Tk = Ti + Tj;
Chris@82 310 Tl = Th + Tk;
Chris@82 311 T1f = KP866025403 * (Tj - Ti);
Chris@82 312 TL = FNMS(KP500000000, Tk, Th);
Chris@82 313 }
/* ii at indices 9, 1, 5. */
Chris@82 314 {
Chris@82 315 E T1g, TM, TN, T1h;
Chris@82 316 T1g = ii[WS(is, 9)];
Chris@82 317 TM = ii[WS(is, 1)];
Chris@82 318 TN = ii[WS(is, 5)];
Chris@82 319 T1h = TM + TN;
Chris@82 320 TO = KP866025403 * (TM - TN);
Chris@82 321 T1v = T1g + T1h;
Chris@82 322 T1i = FNMS(KP500000000, T1h, T1g);
Chris@82 323 }
/* Output stage. Outputs 0, 3, 6 and 9 are built only from the eight
 * three-term sums (T5, Ta, Tg, Tl for ri; Ts, Tx, T1u, T1v for ii). */
Chris@82 324 {
Chris@82 325 E Tb, Tm, T1t, T1w;
Chris@82 326 Tb = T5 + Ta;
Chris@82 327 Tm = Tg + Tl;
Chris@82 328 ro[WS(os, 6)] = Tb - Tm;
Chris@82 329 ro[0] = Tb + Tm;
Chris@82 330 {
Chris@82 331 E T1x, T1y, Tn, Ty;
Chris@82 332 T1x = Ts + Tx;
Chris@82 333 T1y = T1u + T1v;
Chris@82 334 io[WS(os, 6)] = T1x - T1y;
Chris@82 335 io[0] = T1x + T1y;
/* io[3] and io[9] mix a difference of ri-derived sums (Tn) with a
 * difference of ii-derived sums (Ty). */
Chris@82 336 Tn = Tg - Tl;
Chris@82 337 Ty = Ts - Tx;
Chris@82 338 io[WS(os, 3)] = Tn + Ty;
Chris@82 339 io[WS(os, 9)] = Ty - Tn;
Chris@82 340 }
/* ro[3] and ro[9] likewise mix an ri-derived difference (T1t) with an
 * ii-derived difference (T1w). */
Chris@82 341 T1t = T5 - Ta;
Chris@82 342 T1w = T1u - T1v;
Chris@82 343 ro[WS(os, 3)] = T1t - T1w;
Chris@82 344 ro[WS(os, 9)] = T1t + T1w;
/* Outputs 1, 4, 7 and 10: every operand is the sum of a pre-scaled
 * difference and its matching 0.5-weighted term. */
Chris@82 345 {
Chris@82 346 E T11, T1l, T1k, T1m, T14, T18, T17, T19;
Chris@82 347 {
Chris@82 348 E TZ, T10, T1e, T1j;
Chris@82 349 TZ = TA + Tz;
Chris@82 350 T10 = TD + TC;
Chris@82 351 T11 = TZ - T10;
Chris@82 352 T1l = TZ + T10;
Chris@82 353 T1e = T1a + T1d;
Chris@82 354 T1j = T1f + T1i;
Chris@82 355 T1k = T1e - T1j;
Chris@82 356 T1m = T1e + T1j;
Chris@82 357 }
Chris@82 358 {
Chris@82 359 E T12, T13, T15, T16;
Chris@82 360 T12 = TG + TJ;
Chris@82 361 T13 = TL + TO;
Chris@82 362 T14 = T12 - T13;
Chris@82 363 T18 = T12 + T13;
Chris@82 364 T15 = TR + TS;
Chris@82 365 T16 = TU + TV;
Chris@82 366 T17 = T15 + T16;
Chris@82 367 T19 = T15 - T16;
Chris@82 368 }
Chris@82 369 io[WS(os, 1)] = T11 - T14;
Chris@82 370 ro[WS(os, 1)] = T19 + T1k;
Chris@82 371 io[WS(os, 7)] = T11 + T14;
Chris@82 372 ro[WS(os, 7)] = T19 - T1k;
Chris@82 373 ro[WS(os, 10)] = T17 - T18;
Chris@82 374 io[WS(os, 10)] = T1l - T1m;
Chris@82 375 ro[WS(os, 4)] = T17 + T18;
Chris@82 376 io[WS(os, 4)] = T1l + T1m;
Chris@82 377 }
/* Outputs 2, 5, 8 and 11: the same pattern as the previous block, but each
 * pre-scaled difference is subtracted from its 0.5-weighted term. */
Chris@82 378 {
Chris@82 379 E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
Chris@82 380 {
Chris@82 381 E TB, TE, T1o, T1p;
Chris@82 382 TB = Tz - TA;
Chris@82 383 TE = TC - TD;
Chris@82 384 TF = TB - TE;
Chris@82 385 T1r = TB + TE;
Chris@82 386 T1o = T1d - T1a;
Chris@82 387 T1p = T1i - T1f;
Chris@82 388 T1q = T1o - T1p;
Chris@82 389 T1s = T1o + T1p;
Chris@82 390 }
Chris@82 391 {
Chris@82 392 E TK, TP, TT, TW;
Chris@82 393 TK = TG - TJ;
Chris@82 394 TP = TL - TO;
Chris@82 395 TQ = TK - TP;
Chris@82 396 TY = TK + TP;
Chris@82 397 TT = TR - TS;
Chris@82 398 TW = TU - TV;
Chris@82 399 TX = TT + TW;
Chris@82 400 T1n = TT - TW;
Chris@82 401 }
Chris@82 402 io[WS(os, 5)] = TF - TQ;
Chris@82 403 ro[WS(os, 5)] = T1n + T1q;
Chris@82 404 io[WS(os, 11)] = TF + TQ;
Chris@82 405 ro[WS(os, 11)] = T1n - T1q;
Chris@82 406 ro[WS(os, 2)] = TX - TY;
Chris@82 407 io[WS(os, 2)] = T1r - T1s;
Chris@82 408 ro[WS(os, 8)] = TX + TY;
Chris@82 409 io[WS(os, 8)] = T1r + T1s;
Chris@82 410 }
Chris@82 411 }
Chris@82 412 }
Chris@82 413 }
Chris@82 414 }
Chris@82 415
/* Descriptor handed to kdft_register for the non-FMA variant. The initializer
 * is positional: 12, the name "n1_12", then {88, 8, 8, 0}, whose first three
 * entries match the "88 additions, 8 multiplications, 8 fused multiply/add"
 * counts in the header comment above. GENUS and the meaning of the remaining
 * fields come from project headers not shown here -- confirm against the
 * kdft_desc declaration. */
Chris@82 416 static const kdft_desc desc = { 12, "n1_12", {88, 8, 8, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 417
/* Registration entry point (non-FMA variant): passes the planner p, the n1_12
 * kernel and the address of its descriptor to X(kdft_register). X() is a
 * macro from project headers (presumably a name-prefixing wrapper -- confirm). */
Chris@82 418 void X(codelet_n1_12) (planner *p) {
Chris@82 419 X(kdft_register) (p, n1_12, &desc);
Chris@82 420 }
Chris@82 421
Chris@82 422 #endif