annotate src/fftw-3.3.3/dft/scalar/codelets/t1_7.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:35:48 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include t.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 72 FP additions, 66 FP multiplications,
Chris@10 32 * (or, 18 additions, 12 multiplications, 54 fused multiply/add),
Chris@10 33 * 66 stack variables, 6 constants, and 28 memory accesses
Chris@10 34 */
Chris@10 35 #include "t.h"
Chris@10 36
/*
 * t1_7 -- HAVE_FMA build of the size-7 twiddle codelet (the generator
 * command recorded above was run with "-fma -n 7 -name t1_7").
 *
 * ri, ii : arrays that are read and then overwritten in place at offsets 0
 *          and WS(rs, 1) .. WS(rs, 6). The names suggest real / imaginary
 *          parts -- confirm against codelet-dft.h.
 * W      : table of multipliers; W[0] .. W[11] are consumed per iteration,
 *          starting at W + mb * 12.
 * rs     : stride handed to WS() to address the seven points.
 * mb, me : half-open iteration range [mb, me).
 * ms     : amount added to both ri and ii after every iteration.
 *
 * NOTE(review): FMA / FNMS are macros defined outside this file; the
 * comments below assume FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b.
 */
Chris@10 37 static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
/* Six numeric constants introduced through the DK macro (defined elsewhere). */
Chris@10 39 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@10 40 DK(KP801937735, +0.801937735804838252472204639014890102331838324);
Chris@10 41 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@10 42 DK(KP692021471, +0.692021471630095869627814897002069140197260599);
Chris@10 43 DK(KP554958132, +0.554958132087371191422194871006410481067288862);
Chris@10 44 DK(KP356895867, +0.356895867892209443894399510021300583399127187);
Chris@10 45 {
Chris@10 46 INT m;
/*
 * m only counts iterations; the data pointers are stepped in the increment
 * clause (ri and ii by ms, W by 12). MAKE_VOLATILE_STRIDE is a macro from
 * outside this file -- its effect is not visible here.
 */
Chris@10 47 for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
/* Values that must outlive the nested scopes below. */
Chris@10 48 E T1c, T19, T1i, T18, T16, T1q, T1t, T1r, T1u, T1s;
Chris@10 49 {
Chris@10 50 E T1, TR, T1h, Te, Tt, Tw, T1a, TM, T1g, Tr, Tu, TS, Tz, TC, Ty;
Chris@10 51 E Tv, TB;
/* Point 0 is used as loaded; no W entry is applied to it. */
Chris@10 52 T1 = ri[0];
Chris@10 53 T1c = ii[0];
Chris@10 54 {
Chris@10 55 E T9, Tc, TP, Ta, Tb, TO, T7;
Chris@10 56 {
Chris@10 57 E T3, T6, T8, TN, T4, T2, T5;
/*
 * Points 1 and 6: point 1 is combined with (W[0], W[1]) and point 6 with
 * (W[10], W[11]). Under the FMA/FNMS assumption above, T7 = W[0]*ri1 +
 * W[1]*ii1 and TO = W[0]*ii1 - W[1]*ri1.
 */
Chris@10 58 T3 = ri[WS(rs, 1)];
Chris@10 59 T6 = ii[WS(rs, 1)];
Chris@10 60 T2 = W[0];
Chris@10 61 T9 = ri[WS(rs, 6)];
Chris@10 62 Tc = ii[WS(rs, 6)];
Chris@10 63 T8 = W[10];
Chris@10 64 TN = T2 * T6;
Chris@10 65 T4 = T2 * T3;
Chris@10 66 T5 = W[1];
Chris@10 67 TP = T8 * Tc;
Chris@10 68 Ta = T8 * T9;
Chris@10 69 Tb = W[11];
Chris@10 70 TO = FNMS(T5, T3, TN);
Chris@10 71 T7 = FMA(T5, T6, T4);
Chris@10 72 }
Chris@10 73 {
Chris@10 74 E Tg, Tj, Th, TI, Tm, Tp, Tl, Ti, To, TQ, Td, Tf;
Chris@10 75 Tg = ri[WS(rs, 2)];
Chris@10 76 TQ = FNMS(Tb, T9, TP);
Chris@10 77 Td = FMA(Tb, Tc, Ta);
Chris@10 78 Tj = ii[WS(rs, 2)];
Chris@10 79 Tf = W[2];
/* Sum and difference of the weighted points 1 and 6 (ii-side then ri-side). */
Chris@10 80 T19 = TO + TQ;
Chris@10 81 TR = TO - TQ;
Chris@10 82 T1h = Td - T7;
Chris@10 83 Te = T7 + Td;
/* Points 2 and 5 use (W[2], W[3]) and (W[8], W[9]) respectively. */
Chris@10 84 Th = Tf * Tg;
Chris@10 85 TI = Tf * Tj;
Chris@10 86 Tm = ri[WS(rs, 5)];
Chris@10 87 Tp = ii[WS(rs, 5)];
Chris@10 88 Tl = W[8];
Chris@10 89 Ti = W[3];
Chris@10 90 To = W[9];
Chris@10 91 {
Chris@10 92 E TJ, Tk, TL, Tq, TK, Tn, Ts;
Chris@10 93 Tt = ri[WS(rs, 3)];
Chris@10 94 TK = Tl * Tp;
Chris@10 95 Tn = Tl * Tm;
Chris@10 96 TJ = FNMS(Ti, Tg, TI);
Chris@10 97 Tk = FMA(Ti, Tj, Th);
Chris@10 98 TL = FNMS(To, Tm, TK);
Chris@10 99 Tq = FMA(To, Tp, Tn);
Chris@10 100 Tw = ii[WS(rs, 3)];
Chris@10 101 Ts = W[4];
/* Sum and difference of the weighted points 2 and 5. */
Chris@10 102 T1a = TJ + TL;
Chris@10 103 TM = TJ - TL;
Chris@10 104 T1g = Tq - Tk;
Chris@10 105 Tr = Tk + Tq;
Chris@10 106 Tu = Ts * Tt;
Chris@10 107 TS = Ts * Tw;
Chris@10 108 }
/* Points 3 and 4 use (W[4], W[5]) and (W[6], W[7]); finished in the next scope. */
Chris@10 109 Tz = ri[WS(rs, 4)];
Chris@10 110 TC = ii[WS(rs, 4)];
Chris@10 111 Ty = W[6];
Chris@10 112 Tv = W[5];
Chris@10 113 TB = W[7];
Chris@10 114 }
Chris@10 115 }
Chris@10 116 {
Chris@10 117 E TF, TT, Tx, TV, TD, T1d, TU, TA;
Chris@10 118 TF = FNMS(KP356895867, Tr, Te);
Chris@10 119 TU = Ty * TC;
Chris@10 120 TA = Ty * Tz;
Chris@10 121 TT = FNMS(Tv, Tt, TS);
Chris@10 122 Tx = FMA(Tv, Tw, Tu);
Chris@10 123 TV = FNMS(TB, Tz, TU);
Chris@10 124 TD = FMA(TB, TC, TA);
Chris@10 125 T1d = FNMS(KP356895867, T1a, T19);
Chris@10 126 {
Chris@10 127 E T1b, T15, T17, TW;
Chris@10 128 T17 = FNMS(KP554958132, TR, TM);
/* Sum and difference of the weighted points 3 and 4. */
Chris@10 129 T1b = TT + TV;
Chris@10 130 TW = TT - TV;
Chris@10 131 {
Chris@10 132 E TE, T1l, T1e, T12;
Chris@10 133 T1i = TD - Tx;
Chris@10 134 TE = Tx + TD;
Chris@10 135 T1l = FNMS(KP356895867, T19, T1b);
Chris@10 136 T1e = FNMS(KP692021471, T1d, T1b);
/* Output 0 (ii side): point 0 plus the three pair sums. */
Chris@10 137 ii[0] = T19 + T1a + T1b + T1c;
Chris@10 138 T12 = FMA(KP554958132, TM, TW);
Chris@10 139 {
Chris@10 140 E TX, T1o, T1j, T14;
Chris@10 141 TX = FMA(KP554958132, TW, TR);
Chris@10 142 T1o = FMA(KP554958132, T1g, T1i);
Chris@10 143 T1j = FMA(KP554958132, T1i, T1h);
Chris@10 144 T14 = FNMS(KP356895867, TE, Tr);
Chris@10 145 {
Chris@10 146 E TZ, TG, T1m, T1f;
Chris@10 147 TZ = FNMS(KP356895867, Te, TE);
Chris@10 148 TG = FNMS(KP692021471, TF, TE);
/* Output 0 (ri side): point 0 plus the three pair sums. */
Chris@10 149 ri[0] = T1 + Te + Tr + TE;
Chris@10 150 T1m = FNMS(KP692021471, T1l, T1a);
Chris@10 151 T1f = FNMS(KP900968867, T1e, T1c);
Chris@10 152 {
Chris@10 153 E T13, TY, T1p, T1k;
Chris@10 154 T13 = FNMS(KP801937735, T12, TR);
Chris@10 155 TY = FMA(KP801937735, TX, TM);
Chris@10 156 T1p = FNMS(KP801937735, T1o, T1h);
Chris@10 157 T1k = FMA(KP801937735, T1j, T1g);
Chris@10 158 T15 = FNMS(KP692021471, T14, Te);
Chris@10 159 {
Chris@10 160 E T10, TH, T1n, T11;
Chris@10 161 T10 = FNMS(KP692021471, TZ, Tr);
Chris@10 162 TH = FNMS(KP900968867, TG, T1);
Chris@10 163 T1n = FNMS(KP900968867, T1m, T1c);
/*
 * Outputs are stored in mirrored pairs (1,6) and (2,5): each pair shares
 * the same two operands and differs only in FMA versus FNMS.
 */
Chris@10 164 ii[WS(rs, 6)] = FNMS(KP974927912, T1k, T1f);
Chris@10 165 ii[WS(rs, 1)] = FMA(KP974927912, T1k, T1f);
Chris@10 166 T11 = FNMS(KP900968867, T10, T1);
Chris@10 167 ri[WS(rs, 1)] = FMA(KP974927912, TY, TH);
Chris@10 168 ri[WS(rs, 6)] = FNMS(KP974927912, TY, TH);
Chris@10 169 ii[WS(rs, 5)] = FNMS(KP974927912, T1p, T1n);
Chris@10 170 ii[WS(rs, 2)] = FMA(KP974927912, T1p, T1n);
Chris@10 171 ri[WS(rs, 2)] = FMA(KP974927912, T13, T11);
Chris@10 172 ri[WS(rs, 5)] = FNMS(KP974927912, T13, T11);
Chris@10 173 T18 = FNMS(KP801937735, T17, TW);
Chris@10 174 }
Chris@10 175 }
Chris@10 176 }
Chris@10 177 }
Chris@10 178 }
Chris@10 179 T16 = FNMS(KP900968867, T15, T1);
Chris@10 180 T1q = FNMS(KP356895867, T1b, T1a);
Chris@10 181 T1t = FNMS(KP554958132, T1h, T1g);
Chris@10 182 }
Chris@10 183 }
Chris@10 184 }
/* Remaining mirrored pair (3,4), written after the nested scopes close. */
Chris@10 185 ri[WS(rs, 3)] = FMA(KP974927912, T18, T16);
Chris@10 186 ri[WS(rs, 4)] = FNMS(KP974927912, T18, T16);
Chris@10 187 T1r = FNMS(KP692021471, T1q, T19);
Chris@10 188 T1u = FNMS(KP801937735, T1t, T1i);
Chris@10 189 T1s = FNMS(KP900968867, T1r, T1c);
Chris@10 190 ii[WS(rs, 4)] = FNMS(KP974927912, T1u, T1s);
Chris@10 191 ii[WS(rs, 3)] = FMA(KP974927912, T1u, T1s);
Chris@10 192 }
Chris@10 193 }
Chris@10 194 }
Chris@10 195
/*
 * Two-entry tw_instr table referenced by desc below (HAVE_FMA branch).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * file; presumably this requests the full set of multipliers for size 7
 * and then terminates the list -- confirm against the tw_instr definition.
 */
Chris@10 196 static const tw_instr twinstr[] = {
Chris@10 197 {TW_FULL, 0, 7},
Chris@10 198 {TW_NEXT, 1, 0}
Chris@10 199 };
Chris@10 200
/*
 * Descriptor handed to the registration call below (HAVE_FMA branch): it
 * carries the size 7, the name "t1_7", the twinstr table and &GENUS. The
 * triple 18, 12, 54 equals the add / multiply / fused-multiply-add counts
 * quoted in this variant's header comment.
 * NOTE(review): ct_desc is declared elsewhere; the meaning of the fourth
 * count and of the three trailing zeros is not visible here.
 */
Chris@10 201 static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, {18, 12, 54, 0}, 0, 0, 0 };
Chris@10 202
/*
 * Registration entry point (HAVE_FMA branch): passes the planner p, the
 * t1_7 function and its descriptor to kdft_dit_register. X() is a macro
 * defined outside this file that wraps both identifiers.
 */
Chris@10 203 void X(codelet_t1_7) (planner *p) {
Chris@10 204 X(kdft_dit_register) (p, t1_7, &desc);
Chris@10 205 }
Chris@10 206 #else /* HAVE_FMA */
Chris@10 207
Chris@10 208 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include t.h */
Chris@10 209
Chris@10 210 /*
Chris@10 211 * This function contains 72 FP additions, 60 FP multiplications,
Chris@10 212 * (or, 36 additions, 24 multiplications, 36 fused multiply/add),
Chris@10 213 * 29 stack variables, 6 constants, and 28 memory accesses
Chris@10 214 */
Chris@10 215 #include "t.h"
Chris@10 216
/*
 * t1_7 -- build of the size-7 twiddle codelet used when HAVE_FMA is not
 * defined (the generator command recorded above was run with
 * "-n 7 -name t1_7" and without "-fma").
 *
 * ri, ii : arrays that are read and then overwritten in place at offsets 0
 *          and WS(rs, 1) .. WS(rs, 6). The names suggest real / imaginary
 *          parts -- confirm against codelet-dft.h.
 * W      : table of multipliers; W[0] .. W[11] are consumed per iteration,
 *          starting at W + mb * 12.
 * rs     : stride handed to WS() to address the seven points.
 * mb, me : half-open iteration range [mb, me).
 * ms     : amount added to both ri and ii after every iteration.
 *
 * NOTE(review): FMA / FNMS / FNMA are macros defined outside this file; the
 * comments below assume FMA(a, b, c) = a*b + c, FNMS(a, b, c) = c - a*b and
 * FNMA(a, b, c) = -(a*b) - c.
 */
Chris@10 217 static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 218 {
/*
 * Six constants introduced through the DK macro (defined elsewhere).
 * Numerically, 0.6234..., 0.2225... and 0.9009... equal |cos(2*pi*k/7)| and
 * 0.7818..., 0.9749... and 0.4338... equal sin(2*pi*k/7) for k = 1, 2, 3.
 */
Chris@10 219 DK(KP222520933, +0.222520933956314404288902564496794759466355569);
Chris@10 220 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@10 221 DK(KP623489801, +0.623489801858733530525004884004239810632274731);
Chris@10 222 DK(KP433883739, +0.433883739117558120475768332848358754609990728);
Chris@10 223 DK(KP781831482, +0.781831482468029808708444526674057750232334519);
Chris@10 224 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@10 225 {
Chris@10 226 INT m;
/*
 * m only counts iterations; the data pointers are stepped in the increment
 * clause (ri and ii by ms, W by 12). MAKE_VOLATILE_STRIDE is a macro from
 * outside this file -- its effect is not visible here.
 */
Chris@10 227 for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
Chris@10 228 E T1, TR, Tc, TS, TC, TO, Tn, TT, TI, TP, Ty, TU, TF, TQ;
/* Point 0 is used as loaded; no W entry is applied to it. */
Chris@10 229 T1 = ri[0];
Chris@10 230 TR = ii[0];
/*
 * Points 1 and 6: each is combined with its own pair of W entries, then the
 * two results are folded into a sum (Tc, TO) and a difference (TS, TC).
 */
Chris@10 231 {
Chris@10 232 E T6, TA, Tb, TB;
Chris@10 233 {
Chris@10 234 E T3, T5, T2, T4;
Chris@10 235 T3 = ri[WS(rs, 1)];
Chris@10 236 T5 = ii[WS(rs, 1)];
Chris@10 237 T2 = W[0];
Chris@10 238 T4 = W[1];
/* Under the macro assumption above: T6 = W[0]*ri1 + W[1]*ii1, TA = W[0]*ii1 - W[1]*ri1. */
Chris@10 239 T6 = FMA(T2, T3, T4 * T5);
Chris@10 240 TA = FNMS(T4, T3, T2 * T5);
Chris@10 241 }
Chris@10 242 {
Chris@10 243 E T8, Ta, T7, T9;
Chris@10 244 T8 = ri[WS(rs, 6)];
Chris@10 245 Ta = ii[WS(rs, 6)];
Chris@10 246 T7 = W[10];
Chris@10 247 T9 = W[11];
Chris@10 248 Tb = FMA(T7, T8, T9 * Ta);
Chris@10 249 TB = FNMS(T9, T8, T7 * Ta);
Chris@10 250 }
Chris@10 251 Tc = T6 + Tb;
Chris@10 252 TS = Tb - T6;
Chris@10 253 TC = TA - TB;
Chris@10 254 TO = TA + TB;
Chris@10 255 }
/* Points 2 and 5, same pattern with (W[2], W[3]) and (W[8], W[9]). */
Chris@10 256 {
Chris@10 257 E Th, TG, Tm, TH;
Chris@10 258 {
Chris@10 259 E Te, Tg, Td, Tf;
Chris@10 260 Te = ri[WS(rs, 2)];
Chris@10 261 Tg = ii[WS(rs, 2)];
Chris@10 262 Td = W[2];
Chris@10 263 Tf = W[3];
Chris@10 264 Th = FMA(Td, Te, Tf * Tg);
Chris@10 265 TG = FNMS(Tf, Te, Td * Tg);
Chris@10 266 }
Chris@10 267 {
Chris@10 268 E Tj, Tl, Ti, Tk;
Chris@10 269 Tj = ri[WS(rs, 5)];
Chris@10 270 Tl = ii[WS(rs, 5)];
Chris@10 271 Ti = W[8];
Chris@10 272 Tk = W[9];
Chris@10 273 Tm = FMA(Ti, Tj, Tk * Tl);
Chris@10 274 TH = FNMS(Tk, Tj, Ti * Tl);
Chris@10 275 }
Chris@10 276 Tn = Th + Tm;
Chris@10 277 TT = Tm - Th;
Chris@10 278 TI = TG - TH;
Chris@10 279 TP = TG + TH;
Chris@10 280 }
/* Points 3 and 4, same pattern with (W[4], W[5]) and (W[6], W[7]). */
Chris@10 281 {
Chris@10 282 E Ts, TD, Tx, TE;
Chris@10 283 {
Chris@10 284 E Tp, Tr, To, Tq;
Chris@10 285 Tp = ri[WS(rs, 3)];
Chris@10 286 Tr = ii[WS(rs, 3)];
Chris@10 287 To = W[4];
Chris@10 288 Tq = W[5];
Chris@10 289 Ts = FMA(To, Tp, Tq * Tr);
Chris@10 290 TD = FNMS(Tq, Tp, To * Tr);
Chris@10 291 }
Chris@10 292 {
Chris@10 293 E Tu, Tw, Tt, Tv;
Chris@10 294 Tu = ri[WS(rs, 4)];
Chris@10 295 Tw = ii[WS(rs, 4)];
Chris@10 296 Tt = W[6];
Chris@10 297 Tv = W[7];
Chris@10 298 Tx = FMA(Tt, Tu, Tv * Tw);
Chris@10 299 TE = FNMS(Tv, Tu, Tt * Tw);
Chris@10 300 }
Chris@10 301 Ty = Ts + Tx;
Chris@10 302 TU = Tx - Ts;
Chris@10 303 TF = TD - TE;
Chris@10 304 TQ = TD + TE;
Chris@10 305 }
/* Output 0: point 0 plus the three pair sums, for ri and ii separately. */
Chris@10 306 ri[0] = T1 + Tc + Tn + Ty;
Chris@10 307 ii[0] = TO + TP + TQ + TR;
/*
 * Outputs 2 and 5: a term built from the pair sums (Tz, TY) plus or minus a
 * term built from the pair differences (TJ, TX).
 */
Chris@10 308 {
Chris@10 309 E TJ, Tz, TX, TY;
Chris@10 310 TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
Chris@10 311 Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
Chris@10 312 ri[WS(rs, 5)] = Tz - TJ;
Chris@10 313 ri[WS(rs, 2)] = Tz + TJ;
Chris@10 314 TX = FNMS(KP781831482, TU, KP974927912 * TS) - (KP433883739 * TT);
Chris@10 315 TY = FMA(KP623489801, TQ, TR) + FNMA(KP900968867, TP, KP222520933 * TO);
Chris@10 316 ii[WS(rs, 2)] = TX + TY;
Chris@10 317 ii[WS(rs, 5)] = TY - TX;
Chris@10 318 }
/* Outputs 1 and 6, same sum/difference construction. */
Chris@10 319 {
Chris@10 320 E TL, TK, TV, TW;
Chris@10 321 TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
Chris@10 322 TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
Chris@10 323 ri[WS(rs, 6)] = TK - TL;
Chris@10 324 ri[WS(rs, 1)] = TK + TL;
Chris@10 325 TV = FMA(KP781831482, TS, KP974927912 * TT) + (KP433883739 * TU);
Chris@10 326 TW = FMA(KP623489801, TO, TR) + FNMA(KP900968867, TQ, KP222520933 * TP);
Chris@10 327 ii[WS(rs, 1)] = TV + TW;
Chris@10 328 ii[WS(rs, 6)] = TW - TV;
Chris@10 329 }
/* Outputs 3 and 4, same sum/difference construction. */
Chris@10 330 {
Chris@10 331 E TN, TM, TZ, T10;
Chris@10 332 TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
Chris@10 333 TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
Chris@10 334 ri[WS(rs, 4)] = TM - TN;
Chris@10 335 ri[WS(rs, 3)] = TM + TN;
Chris@10 336 TZ = FMA(KP433883739, TS, KP974927912 * TU) - (KP781831482 * TT);
Chris@10 337 T10 = FMA(KP623489801, TP, TR) + FNMA(KP222520933, TQ, KP900968867 * TO);
Chris@10 338 ii[WS(rs, 3)] = TZ + T10;
Chris@10 339 ii[WS(rs, 4)] = T10 - TZ;
Chris@10 340 }
Chris@10 341 }
Chris@10 342 }
Chris@10 343 }
Chris@10 344
/*
 * Two-entry tw_instr table referenced by desc below (non-FMA branch); the
 * entries are the same as in the HAVE_FMA branch of this file.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * file; presumably this requests the full set of multipliers for size 7
 * and then terminates the list -- confirm against the tw_instr definition.
 */
Chris@10 345 static const tw_instr twinstr[] = {
Chris@10 346 {TW_FULL, 0, 7},
Chris@10 347 {TW_NEXT, 1, 0}
Chris@10 348 };
Chris@10 349
/*
 * Descriptor handed to the registration call below (non-FMA branch): it
 * carries the size 7, the name "t1_7", the twinstr table and &GENUS. The
 * triple 36, 24, 36 equals the add / multiply / fused-multiply-add counts
 * quoted in this variant's header comment.
 * NOTE(review): ct_desc is declared elsewhere; the meaning of the fourth
 * count and of the three trailing zeros is not visible here.
 */
Chris@10 350 static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, {36, 24, 36, 0}, 0, 0, 0 };
Chris@10 351
/*
 * Registration entry point (non-FMA branch): passes the planner p, the
 * t1_7 function and its descriptor to kdft_dit_register. X() is a macro
 * defined outside this file that wraps both identifiers.
 */
Chris@10 352 void X(codelet_t1_7) (planner *p) {
Chris@10 353 X(kdft_dit_register) (p, t1_7, &desc);
Chris@10 354 }
Chris@10 355 #endif /* HAVE_FMA */