annotate src/fftw-3.3.5/dft/scalar/codelets/t1_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:36:10 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include t.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 66 FP additions, 36 FP multiplications,
Chris@42 32 * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
Chris@42 33 * 61 stack variables, 1 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "t.h"
Chris@42 36
/*
 * t1_8 (HAVE_FMA variant): generated size-8 twiddle codelet that works in
 * place on the separate real (ri) and imaginary (ii) arrays.
 *
 * For each m in [mb, me) it reads ri[0], ii[0] and the seven further
 * elements ri/ii[WS(rs, k)], k = 1..7, combines element k with the twiddle
 * pair (W[2k-2], W[2k-1]), and stores the eight results back to the same
 * ri/ii locations.  After each iteration ri and ii advance by ms and W
 * advances by 14.
 *
 * Parameters:
 *   ri, ii - real and imaginary data; read and overwritten
 *   W      - twiddle table; W[0]..W[13] are read per iteration, starting
 *            at offset mb * 14
 *   rs     - stride handed to WS() to locate the eight elements
 *   mb, me - half-open iteration range
 *   ms     - increment applied to ri and ii between iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this
 * file; presumably a*b + c and c - a*b respectively -- confirm in the
 * included headers.
 */
Chris@42 37 static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Constant 0.7071... (numerically 1/sqrt(2)); only the odd-indexed outputs use it. */
Chris@42 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 40 {
Chris@42 41 INT m;
/* W is first moved to the twiddles of iteration mb; each pass then steps ri, ii by ms and W by 14. */
Chris@42 42 for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
/* These temporaries outlive the nested scopes below and feed the final stores (outputs 0, 2, 4, 6). */
Chris@42 43 E T1g, T1f, T1e, Tm, T1q, T1o, T1p, TN, T1h, T1i;
Chris@42 44 {
Chris@42 45 E T1, T1m, T1l, T7, TS, Tk, TQ, Te, To, Tr, T17, TM, T12, Tu, TW;
Chris@42 46 E Tp, Tx, Tt, Tq, Tw;
Chris@42 47 {
Chris@42 48 E T3, T6, T2, T5;
/* Element 0 is used as is; element 4 is paired with W[6], W[7]. */
Chris@42 49 T1 = ri[0];
Chris@42 50 T1m = ii[0];
Chris@42 51 T3 = ri[WS(rs, 4)];
Chris@42 52 T6 = ii[WS(rs, 4)];
Chris@42 53 T2 = W[6];
Chris@42 54 T5 = W[7];
Chris@42 55 {
Chris@42 56 E Ta, Td, T9, Tc;
Chris@42 57 {
Chris@42 58 E Tg, Tj, Ti, TR, Th, T1k, T4, Tf;
/* Element 6 with W[10], W[11]; the loads for element 2 (W[2], W[3]) are interleaved here. */
Chris@42 59 Tg = ri[WS(rs, 6)];
Chris@42 60 Tj = ii[WS(rs, 6)];
Chris@42 61 T1k = T2 * T6;
Chris@42 62 T4 = T2 * T3;
Chris@42 63 Tf = W[10];
Chris@42 64 Ti = W[11];
Chris@42 65 T1l = FNMS(T5, T3, T1k);
Chris@42 66 T7 = FMA(T5, T6, T4);
Chris@42 67 TR = Tf * Tj;
Chris@42 68 Th = Tf * Tg;
Chris@42 69 Ta = ri[WS(rs, 2)];
Chris@42 70 Td = ii[WS(rs, 2)];
Chris@42 71 TS = FNMS(Ti, Tg, TR);
Chris@42 72 Tk = FMA(Ti, Tj, Th);
Chris@42 73 T9 = W[2];
Chris@42 74 Tc = W[3];
Chris@42 75 }
Chris@42 76 {
Chris@42 77 E TB, TE, TH, T13, TC, TK, TG, TD, TJ, TP, Tb, TA, Tn;
/* Element 7 with W[12], W[13] and element 3 with W[4], W[5]; element 2's products are finished here. */
Chris@42 78 TB = ri[WS(rs, 7)];
Chris@42 79 TE = ii[WS(rs, 7)];
Chris@42 80 TP = T9 * Td;
Chris@42 81 Tb = T9 * Ta;
Chris@42 82 TA = W[12];
Chris@42 83 TH = ri[WS(rs, 3)];
Chris@42 84 TQ = FNMS(Tc, Ta, TP);
Chris@42 85 Te = FMA(Tc, Td, Tb);
Chris@42 86 T13 = TA * TE;
Chris@42 87 TC = TA * TB;
Chris@42 88 TK = ii[WS(rs, 3)];
Chris@42 89 TG = W[4];
Chris@42 90 TD = W[13];
Chris@42 91 TJ = W[5];
Chris@42 92 {
Chris@42 93 E T14, TF, T16, TL, T15, TI;
Chris@42 94 To = ri[WS(rs, 1)];
Chris@42 95 T15 = TG * TK;
Chris@42 96 TI = TG * TH;
Chris@42 97 T14 = FNMS(TD, TB, T13);
Chris@42 98 TF = FMA(TD, TE, TC);
Chris@42 99 T16 = FNMS(TJ, TH, T15);
Chris@42 100 TL = FMA(TJ, TK, TI);
Chris@42 101 Tr = ii[WS(rs, 1)];
Chris@42 102 Tn = W[0];
/* Sums and differences of the twiddled elements 7 and 3. */
Chris@42 103 T17 = T14 - T16;
Chris@42 104 T1g = T14 + T16;
Chris@42 105 TM = TF + TL;
Chris@42 106 T12 = TF - TL;
Chris@42 107 }
/* Loads for element 5 (W[8], W[9]) and the first products for element 1 (W[0], W[1]). */
Chris@42 108 Tu = ri[WS(rs, 5)];
Chris@42 109 TW = Tn * Tr;
Chris@42 110 Tp = Tn * To;
Chris@42 111 Tx = ii[WS(rs, 5)];
Chris@42 112 Tt = W[8];
Chris@42 113 Tq = W[1];
Chris@42 114 Tw = W[9];
Chris@42 115 }
Chris@42 116 }
Chris@42 117 }
Chris@42 118 {
Chris@42 119 E T8, T1j, T1n, Tz, T1a, TU, Tl, T1b, T1c, T1v, T1t, T1w, T19, T1u, T1d;
Chris@42 120 {
Chris@42 121 E T1r, T10, TV, T1s, T11, T18;
Chris@42 122 {
Chris@42 123 E TO, TX, Ts, TZ, Ty, TT, TY, Tv;
/* Finish elements 1 and 5, then form pairwise sums and differences of the twiddled inputs. */
Chris@42 124 T8 = T1 + T7;
Chris@42 125 TO = T1 - T7;
Chris@42 126 TY = Tt * Tx;
Chris@42 127 Tv = Tt * Tu;
Chris@42 128 TX = FNMS(Tq, To, TW);
Chris@42 129 Ts = FMA(Tq, Tr, Tp);
Chris@42 130 TZ = FNMS(Tw, Tu, TY);
Chris@42 131 Ty = FMA(Tw, Tx, Tv);
Chris@42 132 TT = TQ - TS;
Chris@42 133 T1j = TQ + TS;
Chris@42 134 T1n = T1l + T1m;
Chris@42 135 T1r = T1m - T1l;
Chris@42 136 T10 = TX - TZ;
Chris@42 137 T1f = TX + TZ;
Chris@42 138 Tz = Ts + Ty;
Chris@42 139 TV = Ts - Ty;
Chris@42 140 T1a = TO - TT;
Chris@42 141 TU = TO + TT;
Chris@42 142 T1s = Te - Tk;
Chris@42 143 Tl = Te + Tk;
Chris@42 144 }
Chris@42 145 T1b = T10 - TV;
Chris@42 146 T11 = TV + T10;
Chris@42 147 T18 = T12 - T17;
Chris@42 148 T1c = T12 + T17;
Chris@42 149 T1v = T1s + T1r;
Chris@42 150 T1t = T1r - T1s;
Chris@42 151 T1w = T18 - T11;
Chris@42 152 T19 = T11 + T18;
Chris@42 153 }
/* Odd-indexed outputs: each is a base term plus or minus KP707106781 times a combined term. */
Chris@42 154 ii[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
Chris@42 155 ii[WS(rs, 7)] = FNMS(KP707106781, T1w, T1v);
Chris@42 156 ri[WS(rs, 1)] = FMA(KP707106781, T19, TU);
Chris@42 157 ri[WS(rs, 5)] = FNMS(KP707106781, T19, TU);
Chris@42 158 T1u = T1b + T1c;
Chris@42 159 T1d = T1b - T1c;
Chris@42 160 ii[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
Chris@42 161 ii[WS(rs, 5)] = FNMS(KP707106781, T1u, T1t);
Chris@42 162 ri[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
Chris@42 163 ri[WS(rs, 7)] = FNMS(KP707106781, T1d, T1a);
/* Terms for the even-indexed outputs, stored after the nested scopes close. */
Chris@42 164 T1e = T8 - Tl;
Chris@42 165 Tm = T8 + Tl;
Chris@42 166 T1q = T1n - T1j;
Chris@42 167 T1o = T1j + T1n;
Chris@42 168 T1p = TM - Tz;
Chris@42 169 TN = Tz + TM;
Chris@42 170 }
Chris@42 171 }
/* Even-indexed outputs (0, 2, 4, 6): plain sums and differences, no multiplication. */
Chris@42 172 ii[WS(rs, 2)] = T1p + T1q;
Chris@42 173 ii[WS(rs, 6)] = T1q - T1p;
Chris@42 174 ri[0] = Tm + TN;
Chris@42 175 ri[WS(rs, 4)] = Tm - TN;
Chris@42 176 T1h = T1f - T1g;
Chris@42 177 T1i = T1f + T1g;
Chris@42 178 ii[0] = T1i + T1o;
Chris@42 179 ii[WS(rs, 4)] = T1o - T1i;
Chris@42 180 ri[WS(rs, 2)] = T1e + T1h;
Chris@42 181 ri[WS(rs, 6)] = T1e - T1h;
Chris@42 182 }
Chris@42 183 }
Chris@42 184 }
Chris@42 185
/*
 * Twiddle instruction table referenced by the codelet descriptor below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file; presumably
 * {TW_FULL, 0, 8} requests the full twiddle set for size 8 (t1_8 reads 14
 * values of W per iteration) and {TW_NEXT, 1, 0} ends the list -- confirm.
 */
Chris@42 186 static const tw_instr twinstr[] = {
Chris@42 187 {TW_FULL, 0, 8},
Chris@42 188 {TW_NEXT, 1, 0}
Chris@42 189 };
Chris@42 190
/*
 * Codelet descriptor passed at registration: leading field 8 (presumably
 * the transform size), the name "t1_8", the twinstr table and GENUS
 * (defined outside this file).  The first three entries of {44, 14, 22, 0}
 * match the additions / multiplications / fused multiply-adds quoted in
 * the generated header comment of this HAVE_FMA variant.
 */
Chris@42 191 static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, {44, 14, 22, 0}, 0, 0, 0 };
Chris@42 192
/*
 * Registration entry point: passes the t1_8 function and its descriptor to
 * the planner p through X(kdft_dit_register).  X() is a macro defined
 * outside this file (presumably a name-mangling prefix -- confirm).
 */
Chris@42 193 void X(codelet_t1_8) (planner *p) {
Chris@42 194 X(kdft_dit_register) (p, t1_8, &desc);
Chris@42 195 }
Chris@42 196 #else /* HAVE_FMA */
Chris@42 197
Chris@42 198 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include t.h */
Chris@42 199
Chris@42 200 /*
Chris@42 201 * This function contains 66 FP additions, 32 FP multiplications,
Chris@42 202 * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
Chris@42 203 * 28 stack variables, 1 constants, and 32 memory accesses
Chris@42 204 */
Chris@42 205 #include "t.h"
Chris@42 206
/*
 * t1_8 (variant compiled when HAVE_FMA is not defined): generated size-8
 * twiddle codelet that works in place on the separate real (ri) and
 * imaginary (ii) arrays.
 *
 * For each m in [mb, me) it reads ri[0], ii[0] and the seven further
 * elements ri/ii[WS(rs, k)], k = 1..7, combines element k with the twiddle
 * pair (W[2k-2], W[2k-1]), and stores the eight results back to the same
 * ri/ii locations.  After each iteration ri and ii advance by ms and W
 * advances by 14.
 *
 * Parameters:
 *   ri, ii - real and imaginary data; read and overwritten
 *   W      - twiddle table; W[0]..W[13] are read per iteration, starting
 *            at offset mb * 14
 *   rs     - stride handed to WS() to locate the eight elements
 *   mb, me - half-open iteration range
 *   ms     - increment applied to ri and ii between iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this
 * file; presumably a*b + c and c - a*b respectively -- confirm in the
 * included headers.
 */
Chris@42 207 static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 208 {
/* Constant 0.7071... (numerically 1/sqrt(2)); only the odd-indexed outputs use it. */
Chris@42 209 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 210 {
Chris@42 211 INT m;
/* W is first moved to the twiddles of iteration mb; each pass then steps ri, ii by ms and W by 14. */
Chris@42 212 for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@42 213 E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
Chris@42 214 E TP;
Chris@42 215 {
Chris@42 216 E T1, T18, T6, T17;
/* Element 0 (used as is) combined with element 4, which is paired with W[6], W[7]. */
Chris@42 217 T1 = ri[0];
Chris@42 218 T18 = ii[0];
Chris@42 219 {
Chris@42 220 E T3, T5, T2, T4;
Chris@42 221 T3 = ri[WS(rs, 4)];
Chris@42 222 T5 = ii[WS(rs, 4)];
Chris@42 223 T2 = W[6];
Chris@42 224 T4 = W[7];
Chris@42 225 T6 = FMA(T2, T3, T4 * T5);
Chris@42 226 T17 = FNMS(T4, T3, T2 * T5);
Chris@42 227 }
Chris@42 228 T7 = T1 + T6;
Chris@42 229 T1e = T18 - T17;
Chris@42 230 TH = T1 - T6;
Chris@42 231 T19 = T17 + T18;
Chris@42 232 }
Chris@42 233 {
Chris@42 234 E Tz, TS, TE, TT;
/* Element 7 (W[12], W[13]) and element 3 (W[4], W[5]): sums and differences. */
Chris@42 235 {
Chris@42 236 E Tw, Ty, Tv, Tx;
Chris@42 237 Tw = ri[WS(rs, 7)];
Chris@42 238 Ty = ii[WS(rs, 7)];
Chris@42 239 Tv = W[12];
Chris@42 240 Tx = W[13];
Chris@42 241 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@42 242 TS = FNMS(Tx, Tw, Tv * Ty);
Chris@42 243 }
Chris@42 244 {
Chris@42 245 E TB, TD, TA, TC;
Chris@42 246 TB = ri[WS(rs, 3)];
Chris@42 247 TD = ii[WS(rs, 3)];
Chris@42 248 TA = W[4];
Chris@42 249 TC = W[5];
Chris@42 250 TE = FMA(TA, TB, TC * TD);
Chris@42 251 TT = FNMS(TC, TB, TA * TD);
Chris@42 252 }
Chris@42 253 TF = Tz + TE;
Chris@42 254 T13 = TS + TT;
Chris@42 255 TR = Tz - TE;
Chris@42 256 TU = TS - TT;
Chris@42 257 }
Chris@42 258 {
Chris@42 259 E Tc, TI, Th, TJ;
/* Element 2 (W[2], W[3]) and element 6 (W[10], W[11]): sums and differences. */
Chris@42 260 {
Chris@42 261 E T9, Tb, T8, Ta;
Chris@42 262 T9 = ri[WS(rs, 2)];
Chris@42 263 Tb = ii[WS(rs, 2)];
Chris@42 264 T8 = W[2];
Chris@42 265 Ta = W[3];
Chris@42 266 Tc = FMA(T8, T9, Ta * Tb);
Chris@42 267 TI = FNMS(Ta, T9, T8 * Tb);
Chris@42 268 }
Chris@42 269 {
Chris@42 270 E Te, Tg, Td, Tf;
Chris@42 271 Te = ri[WS(rs, 6)];
Chris@42 272 Tg = ii[WS(rs, 6)];
Chris@42 273 Td = W[10];
Chris@42 274 Tf = W[11];
Chris@42 275 Th = FMA(Td, Te, Tf * Tg);
Chris@42 276 TJ = FNMS(Tf, Te, Td * Tg);
Chris@42 277 }
Chris@42 278 Ti = Tc + Th;
Chris@42 279 T1f = Tc - Th;
Chris@42 280 TK = TI - TJ;
Chris@42 281 T16 = TI + TJ;
Chris@42 282 }
Chris@42 283 {
Chris@42 284 E To, TN, Tt, TO;
/* Element 1 (W[0], W[1]) and element 5 (W[8], W[9]): sums and differences. */
Chris@42 285 {
Chris@42 286 E Tl, Tn, Tk, Tm;
Chris@42 287 Tl = ri[WS(rs, 1)];
Chris@42 288 Tn = ii[WS(rs, 1)];
Chris@42 289 Tk = W[0];
Chris@42 290 Tm = W[1];
Chris@42 291 To = FMA(Tk, Tl, Tm * Tn);
Chris@42 292 TN = FNMS(Tm, Tl, Tk * Tn);
Chris@42 293 }
Chris@42 294 {
Chris@42 295 E Tq, Ts, Tp, Tr;
Chris@42 296 Tq = ri[WS(rs, 5)];
Chris@42 297 Ts = ii[WS(rs, 5)];
Chris@42 298 Tp = W[8];
Chris@42 299 Tr = W[9];
Chris@42 300 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@42 301 TO = FNMS(Tr, Tq, Tp * Ts);
Chris@42 302 }
Chris@42 303 Tu = To + Tt;
Chris@42 304 T12 = TN + TO;
Chris@42 305 TM = To - Tt;
Chris@42 306 TP = TN - TO;
Chris@42 307 }
Chris@42 308 {
Chris@42 309 E Tj, TG, T1b, T1c;
/* Even-indexed outputs (0, 2, 4, 6): plain sums and differences of the partial results. */
Chris@42 310 Tj = T7 + Ti;
Chris@42 311 TG = Tu + TF;
Chris@42 312 ri[WS(rs, 4)] = Tj - TG;
Chris@42 313 ri[0] = Tj + TG;
Chris@42 314 {
Chris@42 315 E T15, T1a, T11, T14;
Chris@42 316 T15 = T12 + T13;
Chris@42 317 T1a = T16 + T19;
Chris@42 318 ii[0] = T15 + T1a;
Chris@42 319 ii[WS(rs, 4)] = T1a - T15;
Chris@42 320 T11 = T7 - Ti;
Chris@42 321 T14 = T12 - T13;
Chris@42 322 ri[WS(rs, 6)] = T11 - T14;
Chris@42 323 ri[WS(rs, 2)] = T11 + T14;
Chris@42 324 }
Chris@42 325 T1b = TF - Tu;
Chris@42 326 T1c = T19 - T16;
Chris@42 327 ii[WS(rs, 2)] = T1b + T1c;
Chris@42 328 ii[WS(rs, 6)] = T1c - T1b;
/* Odd-indexed outputs: the combined terms are scaled by KP707106781 before the final add/subtract. */
Chris@42 329 {
Chris@42 330 E TX, T1g, T10, T1d, TY, TZ;
Chris@42 331 TX = TH - TK;
Chris@42 332 T1g = T1e - T1f;
Chris@42 333 TY = TP - TM;
Chris@42 334 TZ = TR + TU;
Chris@42 335 T10 = KP707106781 * (TY - TZ);
Chris@42 336 T1d = KP707106781 * (TY + TZ);
Chris@42 337 ri[WS(rs, 7)] = TX - T10;
Chris@42 338 ii[WS(rs, 5)] = T1g - T1d;
Chris@42 339 ri[WS(rs, 3)] = TX + T10;
Chris@42 340 ii[WS(rs, 1)] = T1d + T1g;
Chris@42 341 }
Chris@42 342 {
Chris@42 343 E TL, T1i, TW, T1h, TQ, TV;
Chris@42 344 TL = TH + TK;
Chris@42 345 T1i = T1f + T1e;
Chris@42 346 TQ = TM + TP;
Chris@42 347 TV = TR - TU;
Chris@42 348 TW = KP707106781 * (TQ + TV);
Chris@42 349 T1h = KP707106781 * (TV - TQ);
Chris@42 350 ri[WS(rs, 5)] = TL - TW;
Chris@42 351 ii[WS(rs, 7)] = T1i - T1h;
Chris@42 352 ri[WS(rs, 1)] = TL + TW;
Chris@42 353 ii[WS(rs, 3)] = T1h + T1i;
Chris@42 354 }
Chris@42 355 }
Chris@42 356 }
Chris@42 357 }
Chris@42 358 }
Chris@42 359
/*
 * Twiddle instruction table referenced by the codelet descriptor below.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file; presumably
 * {TW_FULL, 0, 8} requests the full twiddle set for size 8 (t1_8 reads 14
 * values of W per iteration) and {TW_NEXT, 1, 0} ends the list -- confirm.
 */
Chris@42 360 static const tw_instr twinstr[] = {
Chris@42 361 {TW_FULL, 0, 8},
Chris@42 362 {TW_NEXT, 1, 0}
Chris@42 363 };
Chris@42 364
/*
 * Codelet descriptor passed at registration: leading field 8 (presumably
 * the transform size), the name "t1_8", the twinstr table and GENUS
 * (defined outside this file).  The first three entries of {52, 18, 14, 0}
 * match the additions / multiplications / fused multiply-adds quoted in
 * the generated header comment of this non-FMA variant.
 */
Chris@42 365 static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, {52, 18, 14, 0}, 0, 0, 0 };
Chris@42 366
/*
 * Registration entry point: passes the t1_8 function and its descriptor to
 * the planner p through X(kdft_dit_register).  X() is a macro defined
 * outside this file (presumably a name-mangling prefix -- confirm).
 */
Chris@42 367 void X(codelet_t1_8) (planner *p) {
Chris@42 368 X(kdft_dit_register) (p, t1_8, &desc);
Chris@42 369 }
Chris@42 370 #endif /* HAVE_FMA */