annotate src/fftw-3.3.5/dft/scalar/codelets/n1_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:35:52 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include n.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 96 FP additions, 24 FP multiplications,
Chris@42 32 * (or, 72 additions, 0 multiplications, 24 fused multiply/add),
Chris@42 33 * 63 stack variables, 2 constants, and 48 memory accesses
Chris@42 34 */
Chris@42 35 #include "n.h"
Chris@42 36
/*
 * n1_12, HAVE_FMA variant: machine-generated kernel (see the gen_notw
 * command line above, -n 12).
 *
 * Each loop iteration reads 12 values from ri and 12 from ii at offsets
 * WS(is, 0..11) and writes 12 values to ro and 12 to io at offsets
 * WS(os, 0..11).  All loads of an iteration happen before its first store.
 *
 * ri, ii   : input arrays (presumably real / imaginary parts -- confirm
 *            against codelet-dft.h)
 * ro, io   : output arrays (presumably real / imaginary parts)
 * is, os   : input / output strides, applied through WS()
 * v        : iteration count; the loop body is skipped when v <= 0
 * ivs, ovs : amounts added to ri/ii and ro/io after every iteration
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this file.  FMA(a, b, c) is
 * presumably a*b + c and FNMS(a, b, c) presumably c - a*b -- confirm.
 */
Chris@42 37 static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* The two constants used below: sqrt(3)/2 (to the printed precision) and 1/2. */
Chris@42 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 41 {
Chris@42 42 INT i;
Chris@42 43 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
/* Loop-scope temporaries: most are assigned inside the nested blocks and
 * all are consumed by the final group of stores (outputs 2, 5, 8, 11). */
Chris@42 44 E TT, TW, TF, T1q, TY, TQ, TX, T1n;
Chris@42 45 {
Chris@42 46 E TA, TS, TR, T5, Ts, Tz, TD, TV, TU, Ta, Tx, TC, T1d, Th, TJ;
Chris@42 47 E TG, Tg, T1u, T1c, T1f, TM, TN, Tk, T1i;
Chris@42 48 {
Chris@42 49 E T6, Tt, Tu, Tv, T9;
Chris@42 50 {
/* Input indices {0, 4, 8}: pair sum/difference of 4 and 8, then the full
 * sum (T5, Ts) and the KP500000000-weighted term (TR, Tz).  Loads for
 * indices {6, 10, 2} are interleaved by the instruction scheduler. */
Chris@42 51 E T1, To, Tp, Tq, T4, T2, T3, T7, T8, Tr;
Chris@42 52 T1 = ri[0];
Chris@42 53 T2 = ri[WS(is, 4)];
Chris@42 54 T3 = ri[WS(is, 8)];
Chris@42 55 To = ii[0];
Chris@42 56 Tp = ii[WS(is, 4)];
Chris@42 57 Tq = ii[WS(is, 8)];
Chris@42 58 T4 = T2 + T3;
Chris@42 59 TA = T3 - T2;
Chris@42 60 T6 = ri[WS(is, 6)];
Chris@42 61 TS = Tp - Tq;
Chris@42 62 Tr = Tp + Tq;
Chris@42 63 TR = FNMS(KP500000000, T4, T1);
Chris@42 64 T5 = T1 + T4;
Chris@42 65 T7 = ri[WS(is, 10)];
Chris@42 66 Ts = To + Tr;
Chris@42 67 Tz = FNMS(KP500000000, Tr, To);
Chris@42 68 T8 = ri[WS(is, 2)];
Chris@42 69 Tt = ii[WS(is, 6)];
Chris@42 70 Tu = ii[WS(is, 10)];
Chris@42 71 Tv = ii[WS(is, 2)];
Chris@42 72 T9 = T7 + T8;
Chris@42 73 TD = T8 - T7;
Chris@42 74 }
Chris@42 75 {
/* Finishes indices {6, 10, 2} (TV, TU, Ta, Tx, TC), processes indices
 * {3, 7, 11} the same way, and loads indices {9, 1, 5}. */
Chris@42 76 E Tc, T1a, TH, TI, Tf, Td, Te, Tw, Ti, Tj, T1b;
Chris@42 77 Tc = ri[WS(is, 3)];
Chris@42 78 TV = Tu - Tv;
Chris@42 79 Tw = Tu + Tv;
Chris@42 80 TU = FNMS(KP500000000, T9, T6);
Chris@42 81 Ta = T6 + T9;
Chris@42 82 Td = ri[WS(is, 7)];
Chris@42 83 Tx = Tt + Tw;
Chris@42 84 TC = FNMS(KP500000000, Tw, Tt);
Chris@42 85 Te = ri[WS(is, 11)];
Chris@42 86 T1a = ii[WS(is, 3)];
Chris@42 87 TH = ii[WS(is, 7)];
Chris@42 88 TI = ii[WS(is, 11)];
Chris@42 89 Tf = Td + Te;
Chris@42 90 T1d = Te - Td;
Chris@42 91 Th = ri[WS(is, 9)];
Chris@42 92 TJ = TH - TI;
Chris@42 93 T1b = TH + TI;
Chris@42 94 TG = FNMS(KP500000000, Tf, Tc);
Chris@42 95 Tg = Tc + Tf;
Chris@42 96 Ti = ri[WS(is, 1)];
Chris@42 97 T1u = T1a + T1b;
Chris@42 98 T1c = FNMS(KP500000000, T1b, T1a);
Chris@42 99 Tj = ri[WS(is, 5)];
Chris@42 100 T1f = ii[WS(is, 9)];
Chris@42 101 TM = ii[WS(is, 1)];
Chris@42 102 TN = ii[WS(is, 5)];
Chris@42 103 Tk = Ti + Tj;
Chris@42 104 T1i = Tj - Ti;
Chris@42 105 }
Chris@42 106 }
Chris@42 107 {
/* Finishes indices {9, 1, 5} (TO, T1g, TL, Tl, T1h) and starts combining
 * the four three-element groups. */
Chris@42 108 E T1t, TO, TL, T1h, T1w, Tb, T1g, Tl;
Chris@42 109 T1t = T5 - Ta;
Chris@42 110 Tb = T5 + Ta;
Chris@42 111 TO = TM - TN;
Chris@42 112 T1g = TM + TN;
Chris@42 113 TL = FNMS(KP500000000, Tk, Th);
Chris@42 114 Tl = Th + Tk;
Chris@42 115 {
/* Outputs 0, 3, 6, 9 are built only from the plain group sums
 * (T5, Ta, Tg, Tl, Ts, Tx, T1u, T1v); ro[3] and ro[9] are stored a little
 * further down, inside the next block. */
Chris@42 116 E T1x, Ty, T1v, Tn, Tm, T1y;
Chris@42 117 T1x = Ts + Tx;
Chris@42 118 Ty = Ts - Tx;
Chris@42 119 T1v = T1f + T1g;
Chris@42 120 T1h = FNMS(KP500000000, T1g, T1f);
Chris@42 121 Tn = Tg - Tl;
Chris@42 122 Tm = Tg + Tl;
Chris@42 123 T1y = T1u + T1v;
Chris@42 124 T1w = T1u - T1v;
Chris@42 125 ro[0] = Tb + Tm;
Chris@42 126 ro[WS(os, 6)] = Tb - Tm;
Chris@42 127 io[WS(os, 3)] = Tn + Ty;
Chris@42 128 io[0] = T1x + T1y;
Chris@42 129 io[WS(os, 6)] = T1x - T1y;
Chris@42 130 io[WS(os, 9)] = Ty - Tn;
Chris@42 131 }
Chris@42 132 {
Chris@42 133 E TB, TE, T1o, T11, T1p, TK, TP, T15, T1k, T18, T14, T16, T1l, T1m;
Chris@42 134 {
/* Apply the KP866025403-weighted differences to the half-weighted terms.
 * The FMA results (TZ, T10, T1e, T1j, T12, T13, T15, T16) feed outputs
 * 1, 4, 7, 10; the FNMS results (TB, TE, T1o, T1p, TK, TP, TT, TW) feed
 * outputs 2, 5, 8, 11. */
Chris@42 135 E T1e, T1j, TZ, T10, T12, T13;
Chris@42 136 TB = FNMS(KP866025403, TA, Tz);
Chris@42 137 TZ = FMA(KP866025403, TA, Tz);
Chris@42 138 T10 = FMA(KP866025403, TD, TC);
Chris@42 139 TE = FNMS(KP866025403, TD, TC);
Chris@42 140 T1o = FNMS(KP866025403, T1d, T1c);
Chris@42 141 T1e = FMA(KP866025403, T1d, T1c);
Chris@42 142 ro[WS(os, 9)] = T1t + T1w;
Chris@42 143 ro[WS(os, 3)] = T1t - T1w;
Chris@42 144 T1l = TZ + T10;
Chris@42 145 T11 = TZ - T10;
Chris@42 146 T1j = FMA(KP866025403, T1i, T1h);
Chris@42 147 T1p = FNMS(KP866025403, T1i, T1h);
Chris@42 148 TK = FNMS(KP866025403, TJ, TG);
Chris@42 149 T12 = FMA(KP866025403, TJ, TG);
Chris@42 150 T13 = FMA(KP866025403, TO, TL);
Chris@42 151 TP = FNMS(KP866025403, TO, TL);
Chris@42 152 TT = FNMS(KP866025403, TS, TR);
Chris@42 153 T15 = FMA(KP866025403, TS, TR);
Chris@42 154 T1m = T1e + T1j;
Chris@42 155 T1k = T1e - T1j;
Chris@42 156 T18 = T12 + T13;
Chris@42 157 T14 = T12 - T13;
Chris@42 158 T16 = FMA(KP866025403, TV, TU);
Chris@42 159 TW = FNMS(KP866025403, TV, TU);
Chris@42 160 }
/* Outputs 1, 4, 7, 10 (io here, ro in the nested block below). */
Chris@42 161 io[WS(os, 10)] = T1l - T1m;
Chris@42 162 io[WS(os, 4)] = T1l + T1m;
Chris@42 163 io[WS(os, 7)] = T11 + T14;
Chris@42 164 io[WS(os, 1)] = T11 - T14;
Chris@42 165 {
Chris@42 166 E T17, T19, T1r, T1s;
Chris@42 167 T17 = T15 + T16;
Chris@42 168 T19 = T15 - T16;
Chris@42 169 ro[WS(os, 7)] = T19 - T1k;
Chris@42 170 ro[WS(os, 1)] = T19 + T1k;
Chris@42 171 ro[WS(os, 4)] = T17 + T18;
Chris@42 172 ro[WS(os, 10)] = T17 - T18;
/* Start of outputs 2, 5, 8, 11: io[2] and io[8] are stored here; TF, T1q,
 * TY and TQ are kept in loop-scope variables for the stores after the
 * nested blocks close. */
Chris@42 173 T1r = TB + TE;
Chris@42 174 TF = TB - TE;
Chris@42 175 T1s = T1o + T1p;
Chris@42 176 T1q = T1o - T1p;
Chris@42 177 TY = TK + TP;
Chris@42 178 TQ = TK - TP;
Chris@42 179 io[WS(os, 2)] = T1r - T1s;
Chris@42 180 io[WS(os, 8)] = T1r + T1s;
Chris@42 181 }
Chris@42 182 }
Chris@42 183 }
Chris@42 184 }
/* Remaining stores for outputs 2, 5, 8, 11. */
Chris@42 185 io[WS(os, 11)] = TF + TQ;
Chris@42 186 io[WS(os, 5)] = TF - TQ;
Chris@42 187 TX = TT + TW;
Chris@42 188 T1n = TT - TW;
Chris@42 189 ro[WS(os, 11)] = T1n - T1q;
Chris@42 190 ro[WS(os, 5)] = T1n + T1q;
Chris@42 191 ro[WS(os, 8)] = TX + TY;
Chris@42 192 ro[WS(os, 2)] = TX - TY;
Chris@42 193 }
Chris@42 194 }
Chris@42 195 }
Chris@42 196
/*
 * Descriptor handed to the planner together with n1_12 (HAVE_FMA variant):
 * size 12, name "n1_12", and the counts {72, 0, 24, 0}, whose first three
 * entries match the "72 additions, 0 multiplications, 24 fused multiply/add"
 * figures in the header comment of this variant.
 * NOTE(review): kdft_desc and GENUS are declared outside this file; the
 * meaning of the fourth count and of the four trailing zeros is not visible
 * here -- confirm against the kdft_desc definition.
 */
Chris@42 197 static const kdft_desc desc = { 12, "n1_12", {72, 0, 24, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 198
/*
 * Registration entry point (HAVE_FMA variant): passes the n1_12 kernel and
 * its descriptor to X(kdft_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file; presumably it
 * decorates the identifier with a library/precision prefix -- confirm.
 */
Chris@42 199 void X(codelet_n1_12) (planner *p) {
Chris@42 200 X(kdft_register) (p, n1_12, &desc);
Chris@42 201 }
Chris@42 202
Chris@42 203 #else /* HAVE_FMA */
Chris@42 204
Chris@42 205 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include n.h */
Chris@42 206
Chris@42 207 /*
Chris@42 208 * This function contains 96 FP additions, 16 FP multiplications,
Chris@42 209 * (or, 88 additions, 8 multiplications, 8 fused multiply/add),
Chris@42 210 * 43 stack variables, 2 constants, and 48 memory accesses
Chris@42 211 */
Chris@42 212 #include "n.h"
Chris@42 213
/*
 * n1_12, variant compiled when HAVE_FMA is not defined: machine-generated
 * kernel (see the gen_notw command line above, -n 12).
 *
 * Each loop iteration reads 12 values from ri and 12 from ii at offsets
 * WS(is, 0..11) and writes 12 values to ro and 12 to io at offsets
 * WS(os, 0..11).  All loads of an iteration happen before its first store.
 *
 * ri, ii   : input arrays (presumably real / imaginary parts -- confirm
 *            against codelet-dft.h)
 * ro, io   : output arrays (presumably real / imaginary parts)
 * is, os   : input / output strides, applied through WS()
 * v        : iteration count; the loop body is skipped when v <= 0
 * ivs, ovs : amounts added to ri/ii and ro/io after every iteration
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this file.  FNMS(a, b, c) is presumably c - a*b --
 * confirm.
 */
Chris@42 214 static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 215 {
/* The two constants used below: sqrt(3)/2 (to the printed precision) and 1/2. */
Chris@42 216 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 217 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 218 {
Chris@42 219 INT i;
Chris@42 220 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
/* Results of the eight three-element stages below.  For each stage there is
 * a plain sum, a KP500000000-weighted term (via FNMS) and a
 * KP866025403-scaled difference. */
Chris@42 221 E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1a, TG;
Chris@42 222 E TJ, T1u, T1d, Tl, T1f, TL, TO, T1v, T1i;
Chris@42 223 {
/* ri at indices 0, 4, 8 -> T5 (sum), TR, TA. */
Chris@42 224 E T1, T2, T3, T4;
Chris@42 225 T1 = ri[0];
Chris@42 226 T2 = ri[WS(is, 4)];
Chris@42 227 T3 = ri[WS(is, 8)];
Chris@42 228 T4 = T2 + T3;
Chris@42 229 T5 = T1 + T4;
Chris@42 230 TR = FNMS(KP500000000, T4, T1);
Chris@42 231 TA = KP866025403 * (T3 - T2);
Chris@42 232 }
Chris@42 233 {
/* ii at indices 0, 4, 8 -> Ts (sum), TS, Tz. */
Chris@42 234 E To, Tp, Tq, Tr;
Chris@42 235 To = ii[0];
Chris@42 236 Tp = ii[WS(is, 4)];
Chris@42 237 Tq = ii[WS(is, 8)];
Chris@42 238 Tr = Tp + Tq;
Chris@42 239 Ts = To + Tr;
Chris@42 240 TS = KP866025403 * (Tp - Tq);
Chris@42 241 Tz = FNMS(KP500000000, Tr, To);
Chris@42 242 }
Chris@42 243 {
/* ri at indices 6, 10, 2 -> Ta (sum), TU, TD. */
Chris@42 244 E T6, T7, T8, T9;
Chris@42 245 T6 = ri[WS(is, 6)];
Chris@42 246 T7 = ri[WS(is, 10)];
Chris@42 247 T8 = ri[WS(is, 2)];
Chris@42 248 T9 = T7 + T8;
Chris@42 249 Ta = T6 + T9;
Chris@42 250 TU = FNMS(KP500000000, T9, T6);
Chris@42 251 TD = KP866025403 * (T8 - T7);
Chris@42 252 }
Chris@42 253 {
/* ii at indices 6, 10, 2 -> Tx (sum), TV, TC. */
Chris@42 254 E Tt, Tu, Tv, Tw;
Chris@42 255 Tt = ii[WS(is, 6)];
Chris@42 256 Tu = ii[WS(is, 10)];
Chris@42 257 Tv = ii[WS(is, 2)];
Chris@42 258 Tw = Tu + Tv;
Chris@42 259 Tx = Tt + Tw;
Chris@42 260 TV = KP866025403 * (Tu - Tv);
Chris@42 261 TC = FNMS(KP500000000, Tw, Tt);
Chris@42 262 }
Chris@42 263 {
/* ri at indices 3, 7, 11 -> Tg (sum), T1a, TG. */
Chris@42 264 E Tc, Td, Te, Tf;
Chris@42 265 Tc = ri[WS(is, 3)];
Chris@42 266 Td = ri[WS(is, 7)];
Chris@42 267 Te = ri[WS(is, 11)];
Chris@42 268 Tf = Td + Te;
Chris@42 269 Tg = Tc + Tf;
Chris@42 270 T1a = KP866025403 * (Te - Td);
Chris@42 271 TG = FNMS(KP500000000, Tf, Tc);
Chris@42 272 }
Chris@42 273 {
/* ii at indices 3, 7, 11 -> T1u (sum), TJ, T1d. */
Chris@42 274 E T1b, TH, TI, T1c;
Chris@42 275 T1b = ii[WS(is, 3)];
Chris@42 276 TH = ii[WS(is, 7)];
Chris@42 277 TI = ii[WS(is, 11)];
Chris@42 278 T1c = TH + TI;
Chris@42 279 TJ = KP866025403 * (TH - TI);
Chris@42 280 T1u = T1b + T1c;
Chris@42 281 T1d = FNMS(KP500000000, T1c, T1b);
Chris@42 282 }
Chris@42 283 {
/* ri at indices 9, 1, 5 -> Tl (sum), T1f, TL. */
Chris@42 284 E Th, Ti, Tj, Tk;
Chris@42 285 Th = ri[WS(is, 9)];
Chris@42 286 Ti = ri[WS(is, 1)];
Chris@42 287 Tj = ri[WS(is, 5)];
Chris@42 288 Tk = Ti + Tj;
Chris@42 289 Tl = Th + Tk;
Chris@42 290 T1f = KP866025403 * (Tj - Ti);
Chris@42 291 TL = FNMS(KP500000000, Tk, Th);
Chris@42 292 }
Chris@42 293 {
/* ii at indices 9, 1, 5 -> T1v (sum), TO, T1i. */
Chris@42 294 E T1g, TM, TN, T1h;
Chris@42 295 T1g = ii[WS(is, 9)];
Chris@42 296 TM = ii[WS(is, 1)];
Chris@42 297 TN = ii[WS(is, 5)];
Chris@42 298 T1h = TM + TN;
Chris@42 299 TO = KP866025403 * (TM - TN);
Chris@42 300 T1v = T1g + T1h;
Chris@42 301 T1i = FNMS(KP500000000, T1h, T1g);
Chris@42 302 }
Chris@42 303 {
/* Outputs 0, 3, 6, 9: built only from the plain sums
 * (T5, Ta, Tg, Tl, Ts, Tx, T1u, T1v). */
Chris@42 304 E Tb, Tm, T1t, T1w;
Chris@42 305 Tb = T5 + Ta;
Chris@42 306 Tm = Tg + Tl;
Chris@42 307 ro[WS(os, 6)] = Tb - Tm;
Chris@42 308 ro[0] = Tb + Tm;
Chris@42 309 {
Chris@42 310 E T1x, T1y, Tn, Ty;
Chris@42 311 T1x = Ts + Tx;
Chris@42 312 T1y = T1u + T1v;
Chris@42 313 io[WS(os, 6)] = T1x - T1y;
Chris@42 314 io[0] = T1x + T1y;
Chris@42 315 Tn = Tg - Tl;
Chris@42 316 Ty = Ts - Tx;
Chris@42 317 io[WS(os, 3)] = Tn + Ty;
Chris@42 318 io[WS(os, 9)] = Ty - Tn;
Chris@42 319 }
Chris@42 320 T1t = T5 - Ta;
Chris@42 321 T1w = T1u - T1v;
Chris@42 322 ro[WS(os, 3)] = T1t - T1w;
Chris@42 323 ro[WS(os, 9)] = T1t + T1w;
Chris@42 324 {
/* Outputs 1, 4, 7, 10: each half-weighted term with its scaled difference
 * ADDED (e.g. TA + Tz, TG + TJ). */
Chris@42 325 E T11, T1l, T1k, T1m, T14, T18, T17, T19;
Chris@42 326 {
Chris@42 327 E TZ, T10, T1e, T1j;
Chris@42 328 TZ = TA + Tz;
Chris@42 329 T10 = TD + TC;
Chris@42 330 T11 = TZ - T10;
Chris@42 331 T1l = TZ + T10;
Chris@42 332 T1e = T1a + T1d;
Chris@42 333 T1j = T1f + T1i;
Chris@42 334 T1k = T1e - T1j;
Chris@42 335 T1m = T1e + T1j;
Chris@42 336 }
Chris@42 337 {
Chris@42 338 E T12, T13, T15, T16;
Chris@42 339 T12 = TG + TJ;
Chris@42 340 T13 = TL + TO;
Chris@42 341 T14 = T12 - T13;
Chris@42 342 T18 = T12 + T13;
Chris@42 343 T15 = TR + TS;
Chris@42 344 T16 = TU + TV;
Chris@42 345 T17 = T15 + T16;
Chris@42 346 T19 = T15 - T16;
Chris@42 347 }
Chris@42 348 io[WS(os, 1)] = T11 - T14;
Chris@42 349 ro[WS(os, 1)] = T19 + T1k;
Chris@42 350 io[WS(os, 7)] = T11 + T14;
Chris@42 351 ro[WS(os, 7)] = T19 - T1k;
Chris@42 352 ro[WS(os, 10)] = T17 - T18;
Chris@42 353 io[WS(os, 10)] = T1l - T1m;
Chris@42 354 ro[WS(os, 4)] = T17 + T18;
Chris@42 355 io[WS(os, 4)] = T1l + T1m;
Chris@42 356 }
Chris@42 357 {
/* Outputs 2, 5, 8, 11: the same terms with the scaled difference
 * SUBTRACTED (e.g. Tz - TA, TG - TJ). */
Chris@42 358 E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
Chris@42 359 {
Chris@42 360 E TB, TE, T1o, T1p;
Chris@42 361 TB = Tz - TA;
Chris@42 362 TE = TC - TD;
Chris@42 363 TF = TB - TE;
Chris@42 364 T1r = TB + TE;
Chris@42 365 T1o = T1d - T1a;
Chris@42 366 T1p = T1i - T1f;
Chris@42 367 T1q = T1o - T1p;
Chris@42 368 T1s = T1o + T1p;
Chris@42 369 }
Chris@42 370 {
Chris@42 371 E TK, TP, TT, TW;
Chris@42 372 TK = TG - TJ;
Chris@42 373 TP = TL - TO;
Chris@42 374 TQ = TK - TP;
Chris@42 375 TY = TK + TP;
Chris@42 376 TT = TR - TS;
Chris@42 377 TW = TU - TV;
Chris@42 378 TX = TT + TW;
Chris@42 379 T1n = TT - TW;
Chris@42 380 }
Chris@42 381 io[WS(os, 5)] = TF - TQ;
Chris@42 382 ro[WS(os, 5)] = T1n + T1q;
Chris@42 383 io[WS(os, 11)] = TF + TQ;
Chris@42 384 ro[WS(os, 11)] = T1n - T1q;
Chris@42 385 ro[WS(os, 2)] = TX - TY;
Chris@42 386 io[WS(os, 2)] = T1r - T1s;
Chris@42 387 ro[WS(os, 8)] = TX + TY;
Chris@42 388 io[WS(os, 8)] = T1r + T1s;
Chris@42 389 }
Chris@42 390 }
Chris@42 391 }
Chris@42 392 }
Chris@42 393 }
Chris@42 394
/*
 * Descriptor handed to the planner together with n1_12 (variant without
 * HAVE_FMA): size 12, name "n1_12", and the counts {88, 8, 8, 0}, whose
 * first three entries match the "88 additions, 8 multiplications, 8 fused
 * multiply/add" figures in the header comment of this variant.
 * NOTE(review): kdft_desc and GENUS are declared outside this file; the
 * meaning of the fourth count and of the four trailing zeros is not visible
 * here -- confirm against the kdft_desc definition.
 */
Chris@42 395 static const kdft_desc desc = { 12, "n1_12", {88, 8, 8, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 396
/*
 * Registration entry point (variant without HAVE_FMA): passes the n1_12
 * kernel and its descriptor to X(kdft_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file; presumably it
 * decorates the identifier with a library/precision prefix -- confirm.
 */
Chris@42 397 void X(codelet_n1_12) (planner *p) {
Chris@42 398 X(kdft_register) (p, n1_12, &desc);
Chris@42 399 }
Chris@42 400
Chris@42 401 #endif /* HAVE_FMA */