annotate src/fftw-3.3.3/dft/scalar/codelets/t2_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:35:59 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include t.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 74 FP additions, 50 FP multiplications,
Chris@10 32 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
Chris@10 33 * 64 stack variables, 1 constants, and 32 memory accesses
Chris@10 34 */
Chris@10 35 #include "t.h"
Chris@10 36
/*
 * t2_8, HAVE_FMA variant: generated size-8 twiddle codelet (see the
 * "Generated by" line above: -n 8 -twiddle-log3 -fma).
 *
 * For each m in [mb, me) the loop:
 *   - reads six reals W[0] .. W[5] from the twiddle array,
 *   - reads the eight real parts ri[WS(rs, k)] and imaginary parts
 *     ii[WS(rs, k)] for k = 0 .. 7,
 *   - writes eight new real and imaginary values back to the same
 *     ri / ii locations (the transform is done in place),
 *   - then advances ri and ii by ms and W by 6.
 *
 * Parameters:
 *   ri, ii  - real / imaginary data, read and overwritten
 *   W       - twiddle table; offset by mb * 6 before the first iteration
 *   rs      - stride handed to WS() to address element k of one transform
 *   mb, me  - half-open range of iterations to run
 *   ms      - pointer increment applied to ri and ii between iterations
 *
 * W[0]/W[1], W[2]/W[3] and W[4]/W[5] are applied directly to elements
 * 1, 3 and 7; the factors used for elements 2, 4, 5 and 6 are built from
 * products of those loaded values.
 * NOTE(review): FMA, FNMS, DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE are
 * defined outside this file; FMA(a, b, c) / FNMS(a, b, c) are presumably
 * a*b + c and c - a*b -- confirm against the project headers.
 */
Chris@10 37 static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
/* Named constant 0.7071067811865475..., i.e. 1/sqrt(2). */
Chris@10 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 40 {
Chris@10 41 INT m;
/* One pass per m; W starts at W + mb * 6 and moves 6 reals per pass. */
Chris@10 42 for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@10 43 E TS, T1m, TJ, T1l, T1k, Tw, T1w, T1u;
Chris@10 44 {
Chris@10 45 E T2, T3, Tl, Tn, T5, T4, Tm, Tr, T6;
/* Load the six stored twiddle values and start the derived products. */
Chris@10 46 T2 = W[0];
Chris@10 47 T3 = W[2];
Chris@10 48 Tl = W[4];
Chris@10 49 Tn = W[5];
Chris@10 50 T5 = W[1];
Chris@10 51 T4 = T2 * T3;
Chris@10 52 Tm = T2 * Tl;
Chris@10 53 Tr = T2 * Tn;
Chris@10 54 T6 = W[3];
Chris@10 55 {
Chris@10 56 E T1, T1s, TG, Td, T1r, Tu, TY, Tk, TW, T18, T1d, TD, TH, TA, T13;
Chris@10 57 E TE, T14;
Chris@10 58 {
Chris@10 59 E To, Ts, Tf, T7, T8, Ti, Tb, T9, Tc, TC, Ta, TF, TB, Tg, Th;
Chris@10 60 E Tj;
/* Derived factors: (To, Ts) -> element 6, (Tf, Ti) -> element 2,
   (T7, Tb) -> element 4, (TC, TG) -> element 5. */
Chris@10 61 T1 = ri[0];
Chris@10 62 To = FMA(T5, Tn, Tm);
Chris@10 63 Ts = FNMS(T5, Tl, Tr);
Chris@10 64 Tf = FMA(T5, T6, T4);
Chris@10 65 T7 = FNMS(T5, T6, T4);
Chris@10 66 Ta = T2 * T6;
Chris@10 67 T1s = ii[0];
Chris@10 68 T8 = ri[WS(rs, 4)];
Chris@10 69 TF = Tf * Tn;
Chris@10 70 TB = Tf * Tl;
Chris@10 71 Ti = FNMS(T5, T3, Ta);
Chris@10 72 Tb = FMA(T5, T3, Ta);
Chris@10 73 T9 = T7 * T8;
Chris@10 74 Tc = ii[WS(rs, 4)];
Chris@10 75 TG = FNMS(Ti, Tl, TF);
Chris@10 76 TC = FMA(Ti, Tn, TB);
Chris@10 77 {
Chris@10 78 E Tp, T1q, Tt, Tq, TX;
Chris@10 79 Tp = ri[WS(rs, 6)];
Chris@10 80 Td = FMA(Tb, Tc, T9);
Chris@10 81 T1q = T7 * Tc;
Chris@10 82 Tt = ii[WS(rs, 6)];
Chris@10 83 Tq = To * Tp;
Chris@10 84 Tg = ri[WS(rs, 2)];
Chris@10 85 T1r = FNMS(Tb, T8, T1q);
Chris@10 86 TX = To * Tt;
Chris@10 87 Tu = FMA(Ts, Tt, Tq);
Chris@10 88 Th = Tf * Tg;
Chris@10 89 Tj = ii[WS(rs, 2)];
Chris@10 90 TY = FNMS(Ts, Tp, TX);
Chris@10 91 }
Chris@10 92 {
Chris@10 93 E TO, TQ, TN, TP, T1a, T1b;
Chris@10 94 {
Chris@10 95 E TK, TM, TL, T19, TV;
Chris@10 96 TK = ri[WS(rs, 7)];
Chris@10 97 TM = ii[WS(rs, 7)];
Chris@10 98 Tk = FMA(Ti, Tj, Th);
Chris@10 99 TV = Tf * Tj;
Chris@10 100 TL = Tl * TK;
Chris@10 101 T19 = Tl * TM;
Chris@10 102 TO = ri[WS(rs, 3)];
Chris@10 103 TW = FNMS(Ti, Tg, TV);
Chris@10 104 TQ = ii[WS(rs, 3)];
Chris@10 105 TN = FMA(Tn, TM, TL);
Chris@10 106 TP = T3 * TO;
Chris@10 107 T1a = FNMS(Tn, TK, T19);
Chris@10 108 T1b = T3 * TQ;
Chris@10 109 }
Chris@10 110 {
Chris@10 111 E Tx, Tz, Ty, T12, T1c, TR;
Chris@10 112 Tx = ri[WS(rs, 1)];
Chris@10 113 TR = FMA(T6, TQ, TP);
Chris@10 114 Tz = ii[WS(rs, 1)];
Chris@10 115 T1c = FNMS(T6, TO, T1b);
Chris@10 116 Ty = T2 * Tx;
Chris@10 117 T18 = TN - TR;
Chris@10 118 TS = TN + TR;
Chris@10 119 T12 = T2 * Tz;
Chris@10 120 T1d = T1a - T1c;
Chris@10 121 T1m = T1a + T1c;
Chris@10 122 TD = ri[WS(rs, 5)];
Chris@10 123 TH = ii[WS(rs, 5)];
Chris@10 124 TA = FMA(T5, Tz, Ty);
Chris@10 125 T13 = FNMS(T5, Tx, T12);
Chris@10 126 TE = TC * TD;
Chris@10 127 T14 = TC * TH;
Chris@10 128 }
Chris@10 129 }
Chris@10 130 }
Chris@10 131 {
Chris@10 132 E Te, T1p, T1t, Tv;
Chris@10 133 {
Chris@10 134 E T1g, T10, T1z, T1B, T1A, T1j, T1C, T1f;
Chris@10 135 {
Chris@10 136 E T1x, T11, T16, T1y;
Chris@10 137 {
Chris@10 138 E TU, TZ, TI, T15;
Chris@10 139 Te = T1 + Td;
Chris@10 140 TU = T1 - Td;
Chris@10 141 TZ = TW - TY;
Chris@10 142 T1p = TW + TY;
Chris@10 143 TI = FMA(TG, TH, TE);
Chris@10 144 T15 = FNMS(TG, TD, T14);
Chris@10 145 T1t = T1r + T1s;
Chris@10 146 T1x = T1s - T1r;
Chris@10 147 T1g = TU - TZ;
Chris@10 148 T10 = TU + TZ;
Chris@10 149 T11 = TA - TI;
Chris@10 150 TJ = TA + TI;
Chris@10 151 T1l = T13 + T15;
Chris@10 152 T16 = T13 - T15;
Chris@10 153 T1y = Tk - Tu;
Chris@10 154 Tv = Tk + Tu;
Chris@10 155 }
Chris@10 156 {
Chris@10 157 E T1i, T1e, T17, T1h;
Chris@10 158 T1i = T18 + T1d;
Chris@10 159 T1e = T18 - T1d;
Chris@10 160 T17 = T11 + T16;
Chris@10 161 T1h = T16 - T11;
Chris@10 162 T1z = T1x - T1y;
Chris@10 163 T1B = T1y + T1x;
Chris@10 164 T1A = T1h + T1i;
Chris@10 165 T1j = T1h - T1i;
Chris@10 166 T1C = T1e - T17;
Chris@10 167 T1f = T17 + T1e;
Chris@10 168 }
Chris@10 169 }
/* Odd-indexed outputs (1, 3, 5, 7): the only ones scaled by KP707106781. */
Chris@10 170 ri[WS(rs, 7)] = FNMS(KP707106781, T1j, T1g);
Chris@10 171 ii[WS(rs, 7)] = FNMS(KP707106781, T1C, T1B);
Chris@10 172 ri[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
Chris@10 173 ri[WS(rs, 5)] = FNMS(KP707106781, T1f, T10);
Chris@10 174 ii[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
Chris@10 175 ii[WS(rs, 5)] = FNMS(KP707106781, T1A, T1z);
Chris@10 176 ri[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
Chris@10 177 ii[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
Chris@10 178 }
Chris@10 179 T1k = Te - Tv;
Chris@10 180 Tw = Te + Tv;
Chris@10 181 T1w = T1t - T1p;
Chris@10 182 T1u = T1p + T1t;
Chris@10 183 }
Chris@10 184 }
Chris@10 185 }
Chris@10 186 {
Chris@10 187 E TT, T1v, T1n, T1o;
/* Even-indexed outputs (0, 2, 4, 6): plain sums and differences. */
Chris@10 188 TT = TJ + TS;
Chris@10 189 T1v = TS - TJ;
Chris@10 190 T1n = T1l - T1m;
Chris@10 191 T1o = T1l + T1m;
Chris@10 192 ii[WS(rs, 2)] = T1v + T1w;
Chris@10 193 ii[WS(rs, 6)] = T1w - T1v;
Chris@10 194 ri[0] = Tw + TT;
Chris@10 195 ri[WS(rs, 4)] = Tw - TT;
Chris@10 196 ii[0] = T1o + T1u;
Chris@10 197 ii[WS(rs, 4)] = T1u - T1o;
Chris@10 198 ri[WS(rs, 2)] = T1k + T1n;
Chris@10 199 ri[WS(rs, 6)] = T1k - T1n;
Chris@10 200 }
Chris@10 201 }
Chris@10 202 }
Chris@10 203 }
Chris@10 204
/*
 * Twiddle instruction table referenced by the descriptor below: three
 * TW_CEXP entries whose last field is 1, 3 and 7, closed by a TW_NEXT entry.
 * NOTE(review): the last field is presumably the twiddle exponent; three
 * complex values would match the six reals t2_8 reads from W per
 * iteration -- confirm against the tw_instr definition.
 */
Chris@10 205 static const tw_instr twinstr[] = {
Chris@10 206 {TW_CEXP, 0, 1},
Chris@10 207 {TW_CEXP, 0, 3},
Chris@10 208 {TW_CEXP, 0, 7},
Chris@10 209 {TW_NEXT, 1, 0}
Chris@10 210 };
Chris@10 211
/*
 * Descriptor for the HAVE_FMA build of t2_8: size 8, name "t2_8", the
 * twinstr table, &GENUS, and the counts {44, 20, 30, 0}, which repeat the
 * "44 additions, 20 multiplications, 30 fused multiply/add" figures from
 * the generator comment. The trailing zero fields are not explained in
 * this file.
 */
Chris@10 212 static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, {44, 20, 30, 0}, 0, 0, 0 };
Chris@10 213
/*
 * Registration entry point (HAVE_FMA build): passes the t2_8 function and
 * its descriptor to X(kdft_dit_register) for the given planner p.
 */
Chris@10 214 void X(codelet_t2_8) (planner *p) {
Chris@10 215 X(kdft_dit_register) (p, t2_8, &desc);
Chris@10 216 }
Chris@10 217 #else /* HAVE_FMA */
Chris@10 218
Chris@10 219 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include t.h */
Chris@10 220
Chris@10 221 /*
Chris@10 222 * This function contains 74 FP additions, 44 FP multiplications,
Chris@10 223 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
Chris@10 224 * 42 stack variables, 1 constants, and 32 memory accesses
Chris@10 225 */
Chris@10 226 #include "t.h"
Chris@10 227
/*
 * t2_8, non-FMA variant: generated size-8 twiddle codelet (see the
 * "Generated by" line above: -n 8 -twiddle-log3, without -fma).
 *
 * For each m in [mb, me) the loop:
 *   - reads six reals W[0] .. W[5] from the twiddle array,
 *   - reads the eight real parts ri[WS(rs, k)] and imaginary parts
 *     ii[WS(rs, k)] for k = 0 .. 7,
 *   - writes eight new real and imaginary values back to the same
 *     ri / ii locations (the transform is done in place),
 *   - then advances ri and ii by ms and W by 6.
 *
 * Parameters:
 *   ri, ii  - real / imaginary data, read and overwritten
 *   W       - twiddle table; offset by mb * 6 before the first iteration
 *   rs      - stride handed to WS() to address element k of one transform
 *   mb, me  - half-open range of iterations to run
 *   ms      - pointer increment applied to ri and ii between iterations
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE are
 * defined outside this file; FMA(a, b, c) / FNMS(a, b, c) are presumably
 * a*b + c and c - a*b -- confirm against the project headers.
 */
Chris@10 228 static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 229 {
/* Named constant 0.7071067811865475..., i.e. 1/sqrt(2). */
Chris@10 230 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 231 {
Chris@10 232 INT m;
/* One pass per m; W starts at W + mb * 6 and moves 6 reals per pass. */
Chris@10 233 for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@10 234 E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
Chris@10 235 {
Chris@10 236 E T4, Tb, T7, Ta;
/* Load W[0] .. W[5] and derive the remaining factors from their products:
   (T8, Tc) -> element 4, (Tg, Ti) -> element 2,
   (Tn, Tp) -> element 6, (Tx, Tz) -> element 5.
   (T2, T5), (T3, T6), (Tl, Tm) are used as loaded on elements 1, 3, 7. */
Chris@10 237 T2 = W[0];
Chris@10 238 T5 = W[1];
Chris@10 239 T3 = W[2];
Chris@10 240 T6 = W[3];
Chris@10 241 T4 = T2 * T3;
Chris@10 242 Tb = T5 * T3;
Chris@10 243 T7 = T5 * T6;
Chris@10 244 Ta = T2 * T6;
Chris@10 245 T8 = T4 - T7;
Chris@10 246 Tc = Ta + Tb;
Chris@10 247 Tg = T4 + T7;
Chris@10 248 Ti = Ta - Tb;
Chris@10 249 Tl = W[4];
Chris@10 250 Tm = W[5];
Chris@10 251 Tn = FMA(T2, Tl, T5 * Tm);
Chris@10 252 Tz = FNMS(Ti, Tl, Tg * Tm);
Chris@10 253 Tp = FNMS(T5, Tl, T2 * Tm);
Chris@10 254 Tx = FMA(Tg, Tl, Ti * Tm);
Chris@10 255 }
Chris@10 256 {
Chris@10 257 E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
Chris@10 258 E TT;
/* Elements 0 and 4: sum and difference of element 0 with scaled element 4. */
Chris@10 259 {
Chris@10 260 E T1, T1c, Te, T1b, T9, Td;
Chris@10 261 T1 = ri[0];
Chris@10 262 T1c = ii[0];
Chris@10 263 T9 = ri[WS(rs, 4)];
Chris@10 264 Td = ii[WS(rs, 4)];
Chris@10 265 Te = FMA(T8, T9, Tc * Td);
Chris@10 266 T1b = FNMS(Tc, T9, T8 * Td);
Chris@10 267 Tf = T1 + Te;
Chris@10 268 T1i = T1c - T1b;
Chris@10 269 TL = T1 - Te;
Chris@10 270 T1d = T1b + T1c;
Chris@10 271 }
/* Elements 7 and 3. */
Chris@10 272 {
Chris@10 273 E TF, TW, TI, TX;
Chris@10 274 {
Chris@10 275 E TD, TE, TG, TH;
Chris@10 276 TD = ri[WS(rs, 7)];
Chris@10 277 TE = ii[WS(rs, 7)];
Chris@10 278 TF = FMA(Tl, TD, Tm * TE);
Chris@10 279 TW = FNMS(Tm, TD, Tl * TE);
Chris@10 280 TG = ri[WS(rs, 3)];
Chris@10 281 TH = ii[WS(rs, 3)];
Chris@10 282 TI = FMA(T3, TG, T6 * TH);
Chris@10 283 TX = FNMS(T6, TG, T3 * TH);
Chris@10 284 }
Chris@10 285 TJ = TF + TI;
Chris@10 286 T17 = TW + TX;
Chris@10 287 TV = TF - TI;
Chris@10 288 TY = TW - TX;
Chris@10 289 }
/* Elements 2 and 6. */
Chris@10 290 {
Chris@10 291 E Tk, TM, Tr, TN;
Chris@10 292 {
Chris@10 293 E Th, Tj, To, Tq;
Chris@10 294 Th = ri[WS(rs, 2)];
Chris@10 295 Tj = ii[WS(rs, 2)];
Chris@10 296 Tk = FMA(Tg, Th, Ti * Tj);
Chris@10 297 TM = FNMS(Ti, Th, Tg * Tj);
Chris@10 298 To = ri[WS(rs, 6)];
Chris@10 299 Tq = ii[WS(rs, 6)];
Chris@10 300 Tr = FMA(Tn, To, Tp * Tq);
Chris@10 301 TN = FNMS(Tp, To, Tn * Tq);
Chris@10 302 }
Chris@10 303 Ts = Tk + Tr;
Chris@10 304 T1j = Tk - Tr;
Chris@10 305 TO = TM - TN;
Chris@10 306 T1a = TM + TN;
Chris@10 307 }
/* Elements 1 and 5. */
Chris@10 308 {
Chris@10 309 E Tw, TR, TB, TS;
Chris@10 310 {
Chris@10 311 E Tu, Tv, Ty, TA;
Chris@10 312 Tu = ri[WS(rs, 1)];
Chris@10 313 Tv = ii[WS(rs, 1)];
Chris@10 314 Tw = FMA(T2, Tu, T5 * Tv);
Chris@10 315 TR = FNMS(T5, Tu, T2 * Tv);
Chris@10 316 Ty = ri[WS(rs, 5)];
Chris@10 317 TA = ii[WS(rs, 5)];
Chris@10 318 TB = FMA(Tx, Ty, Tz * TA);
Chris@10 319 TS = FNMS(Tz, Ty, Tx * TA);
Chris@10 320 }
Chris@10 321 TC = Tw + TB;
Chris@10 322 T16 = TR + TS;
Chris@10 323 TQ = Tw - TB;
Chris@10 324 TT = TR - TS;
Chris@10 325 }
/* Combine the partial sums and store; even outputs (0, 2, 4, 6) first. */
Chris@10 326 {
Chris@10 327 E Tt, TK, T1f, T1g;
Chris@10 328 Tt = Tf + Ts;
Chris@10 329 TK = TC + TJ;
Chris@10 330 ri[WS(rs, 4)] = Tt - TK;
Chris@10 331 ri[0] = Tt + TK;
Chris@10 332 {
Chris@10 333 E T19, T1e, T15, T18;
Chris@10 334 T19 = T16 + T17;
Chris@10 335 T1e = T1a + T1d;
Chris@10 336 ii[0] = T19 + T1e;
Chris@10 337 ii[WS(rs, 4)] = T1e - T19;
Chris@10 338 T15 = Tf - Ts;
Chris@10 339 T18 = T16 - T17;
Chris@10 340 ri[WS(rs, 6)] = T15 - T18;
Chris@10 341 ri[WS(rs, 2)] = T15 + T18;
Chris@10 342 }
Chris@10 343 T1f = TJ - TC;
Chris@10 344 T1g = T1d - T1a;
Chris@10 345 ii[WS(rs, 2)] = T1f + T1g;
Chris@10 346 ii[WS(rs, 6)] = T1g - T1f;
/* Odd outputs (1, 3, 5, 7): the only ones scaled by KP707106781. */
Chris@10 347 {
Chris@10 348 E T11, T1k, T14, T1h, T12, T13;
Chris@10 349 T11 = TL - TO;
Chris@10 350 T1k = T1i - T1j;
Chris@10 351 T12 = TT - TQ;
Chris@10 352 T13 = TV + TY;
Chris@10 353 T14 = KP707106781 * (T12 - T13);
Chris@10 354 T1h = KP707106781 * (T12 + T13);
Chris@10 355 ri[WS(rs, 7)] = T11 - T14;
Chris@10 356 ii[WS(rs, 5)] = T1k - T1h;
Chris@10 357 ri[WS(rs, 3)] = T11 + T14;
Chris@10 358 ii[WS(rs, 1)] = T1h + T1k;
Chris@10 359 }
Chris@10 360 {
Chris@10 361 E TP, T1m, T10, T1l, TU, TZ;
Chris@10 362 TP = TL + TO;
Chris@10 363 T1m = T1j + T1i;
Chris@10 364 TU = TQ + TT;
Chris@10 365 TZ = TV - TY;
Chris@10 366 T10 = KP707106781 * (TU + TZ);
Chris@10 367 T1l = KP707106781 * (TZ - TU);
Chris@10 368 ri[WS(rs, 5)] = TP - T10;
Chris@10 369 ii[WS(rs, 7)] = T1m - T1l;
Chris@10 370 ri[WS(rs, 1)] = TP + T10;
Chris@10 371 ii[WS(rs, 3)] = T1l + T1m;
Chris@10 372 }
Chris@10 373 }
Chris@10 374 }
Chris@10 375 }
Chris@10 376 }
Chris@10 377 }
Chris@10 378
/*
 * Twiddle instruction table referenced by the descriptor below: three
 * TW_CEXP entries whose last field is 1, 3 and 7, closed by a TW_NEXT entry.
 * NOTE(review): the last field is presumably the twiddle exponent; three
 * complex values would match the six reals t2_8 reads from W per
 * iteration -- confirm against the tw_instr definition.
 */
Chris@10 379 static const tw_instr twinstr[] = {
Chris@10 380 {TW_CEXP, 0, 1},
Chris@10 381 {TW_CEXP, 0, 3},
Chris@10 382 {TW_CEXP, 0, 7},
Chris@10 383 {TW_NEXT, 1, 0}
Chris@10 384 };
Chris@10 385
/*
 * Descriptor for the non-FMA build of t2_8: size 8, name "t2_8", the
 * twinstr table, &GENUS, and the counts {56, 26, 18, 0}, which repeat the
 * "56 additions, 26 multiplications, 18 fused multiply/add" figures from
 * the generator comment. The trailing zero fields are not explained in
 * this file.
 */
Chris@10 386 static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, {56, 26, 18, 0}, 0, 0, 0 };
Chris@10 387
/*
 * Registration entry point (non-FMA build): passes the t2_8 function and
 * its descriptor to X(kdft_dit_register) for the given planner p.
 */
Chris@10 388 void X(codelet_t2_8) (planner *p) {
Chris@10 389 X(kdft_dit_register) (p, t2_8, &desc);
Chris@10 390 }
Chris@10 391 #endif /* HAVE_FMA */