annotate src/fftw-3.3.8/dft/scalar/codelets/t1_7.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:13 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 72 FP additions, 66 FP multiplications,
Chris@82 32 * (or, 18 additions, 12 multiplications, 54 fused multiply/add),
Chris@82 33 * 37 stack variables, 6 constants, and 28 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/t.h"
Chris@82 36
/*
 * t1_7 (FMA-preferring variant): genfft-generated twiddle codelet.  The
 * generator command line above specifies "-n 7", and the descriptor that
 * follows this function carries 7 as its first field.
 *
 * ri, ii : base pointers of the two data arrays; both are read and then
 *          overwritten in place at offsets 0 and WS(rs, 1)..WS(rs, 6)
 *          (presumably real and imaginary parts -- confirm in
 *          dft/codelet-dft.h).
 * W      : twiddle table; 12 values (W[0]..W[11]) are consumed per
 *          iteration, starting at offset mb * 12.
 * rs     : stride handed to WS() to locate the seven elements.
 * mb, me : the loop runs for m = mb, ..., me - 1.
 * ms     : amount added to ri and ii after every iteration.
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are
 * declared in headers not visible here.  The comments below assume
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm.
 */
Chris@82 37 static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric constants used by the output stages below. */
Chris@82 39 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@82 40 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@82 41 DK(KP801937735, +0.801937735804838252472204639014890102331838324);
Chris@82 42 DK(KP554958132, +0.554958132087371191422194871006410481067288862);
Chris@82 43 DK(KP692021471, +0.692021471630095869627814897002069140197260599);
Chris@82 44 DK(KP356895867, +0.356895867892209443894399510021300583399127187);
Chris@82 45 {
Chris@82 46 INT m;
/* One pass per m; ri, ii and W are advanced in the loop header itself. */
Chris@82 47 for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
Chris@82 48 E T1, T1c, Te, T1h, TR, T19, Tr, T1g, TM, T1a, TE, T1i, TW, T1b;
/* Element 0 is used as-is; no W entry is applied to it. */
Chris@82 49 T1 = ri[0];
Chris@82 50 T1c = ii[0];
/*
 * Elements 1 and 6: combine each with its W pair ((W[0], W[1]) and
 * (W[10], W[11])), then form the sum and difference of the two results
 * for each array (Te/T1h from the ri-side, T19/TR from the ii-side).
 */
Chris@82 51 {
Chris@82 52 E T3, T6, T4, TN, T9, Tc, Ta, TP, T2, T8;
Chris@82 53 T3 = ri[WS(rs, 1)];
Chris@82 54 T6 = ii[WS(rs, 1)];
Chris@82 55 T2 = W[0];
Chris@82 56 T4 = T2 * T3;
Chris@82 57 TN = T2 * T6;
Chris@82 58 T9 = ri[WS(rs, 6)];
Chris@82 59 Tc = ii[WS(rs, 6)];
Chris@82 60 T8 = W[10];
Chris@82 61 Ta = T8 * T9;
Chris@82 62 TP = T8 * Tc;
Chris@82 63 {
Chris@82 64 E T7, TO, Td, TQ, T5, Tb;
Chris@82 65 T5 = W[1];
Chris@82 66 T7 = FMA(T5, T6, T4);
Chris@82 67 TO = FNMS(T5, T3, TN);
Chris@82 68 Tb = W[11];
Chris@82 69 Td = FMA(Tb, Tc, Ta);
Chris@82 70 TQ = FNMS(Tb, T9, TP);
Chris@82 71 Te = T7 + Td;
Chris@82 72 T1h = Td - T7;
Chris@82 73 TR = TO - TQ;
Chris@82 74 T19 = TO + TQ;
Chris@82 75 }
Chris@82 76 }
/* Elements 2 and 5: same pattern with (W[2], W[3]) and (W[8], W[9]). */
Chris@82 77 {
Chris@82 78 E Tg, Tj, Th, TI, Tm, Tp, Tn, TK, Tf, Tl;
Chris@82 79 Tg = ri[WS(rs, 2)];
Chris@82 80 Tj = ii[WS(rs, 2)];
Chris@82 81 Tf = W[2];
Chris@82 82 Th = Tf * Tg;
Chris@82 83 TI = Tf * Tj;
Chris@82 84 Tm = ri[WS(rs, 5)];
Chris@82 85 Tp = ii[WS(rs, 5)];
Chris@82 86 Tl = W[8];
Chris@82 87 Tn = Tl * Tm;
Chris@82 88 TK = Tl * Tp;
Chris@82 89 {
Chris@82 90 E Tk, TJ, Tq, TL, Ti, To;
Chris@82 91 Ti = W[3];
Chris@82 92 Tk = FMA(Ti, Tj, Th);
Chris@82 93 TJ = FNMS(Ti, Tg, TI);
Chris@82 94 To = W[9];
Chris@82 95 Tq = FMA(To, Tp, Tn);
Chris@82 96 TL = FNMS(To, Tm, TK);
Chris@82 97 Tr = Tk + Tq;
Chris@82 98 T1g = Tq - Tk;
Chris@82 99 TM = TJ - TL;
Chris@82 100 T1a = TJ + TL;
Chris@82 101 }
Chris@82 102 }
/* Elements 3 and 4: same pattern with (W[4], W[5]) and (W[6], W[7]). */
Chris@82 103 {
Chris@82 104 E Tt, Tw, Tu, TS, Tz, TC, TA, TU, Ts, Ty;
Chris@82 105 Tt = ri[WS(rs, 3)];
Chris@82 106 Tw = ii[WS(rs, 3)];
Chris@82 107 Ts = W[4];
Chris@82 108 Tu = Ts * Tt;
Chris@82 109 TS = Ts * Tw;
Chris@82 110 Tz = ri[WS(rs, 4)];
Chris@82 111 TC = ii[WS(rs, 4)];
Chris@82 112 Ty = W[6];
Chris@82 113 TA = Ty * Tz;
Chris@82 114 TU = Ty * TC;
Chris@82 115 {
Chris@82 116 E Tx, TT, TD, TV, Tv, TB;
Chris@82 117 Tv = W[5];
Chris@82 118 Tx = FMA(Tv, Tw, Tu);
Chris@82 119 TT = FNMS(Tv, Tt, TS);
Chris@82 120 TB = W[7];
Chris@82 121 TD = FMA(TB, TC, TA);
Chris@82 122 TV = FNMS(TB, Tz, TU);
Chris@82 123 TE = Tx + TD;
Chris@82 124 T1i = TD - Tx;
Chris@82 125 TW = TT - TV;
Chris@82 126 T1b = TT + TV;
Chris@82 127 }
Chris@82 128 }
/* Output 0: element 0 plus the three pair sums, for each array. */
Chris@82 129 ri[0] = T1 + Te + Tr + TE;
Chris@82 130 ii[0] = T19 + T1a + T1b + T1c;
/* ri outputs 1 and 6: shared term TH, plus/minus KP974927912 * TY. */
Chris@82 131 {
Chris@82 132 E TG, TY, TF, TX, TH;
Chris@82 133 TF = FNMS(KP356895867, Tr, Te);
Chris@82 134 TG = FNMS(KP692021471, TF, TE);
Chris@82 135 TX = FMA(KP554958132, TW, TR);
Chris@82 136 TY = FMA(KP801937735, TX, TM);
Chris@82 137 TH = FNMS(KP900968867, TG, T1);
Chris@82 138 ri[WS(rs, 6)] = FNMS(KP974927912, TY, TH);
Chris@82 139 ri[WS(rs, 1)] = FMA(KP974927912, TY, TH);
Chris@82 140 }
/* ii outputs 1 and 6: shared term T1f, plus/minus KP974927912 * T1k. */
Chris@82 141 {
Chris@82 142 E T1e, T1k, T1d, T1j, T1f;
Chris@82 143 T1d = FNMS(KP356895867, T1a, T19);
Chris@82 144 T1e = FNMS(KP692021471, T1d, T1b);
Chris@82 145 T1j = FMA(KP554958132, T1i, T1h);
Chris@82 146 T1k = FMA(KP801937735, T1j, T1g);
Chris@82 147 T1f = FNMS(KP900968867, T1e, T1c);
Chris@82 148 ii[WS(rs, 1)] = FMA(KP974927912, T1k, T1f);
Chris@82 149 ii[WS(rs, 6)] = FNMS(KP974927912, T1k, T1f);
Chris@82 150 }
/* ri outputs 2 and 5. */
Chris@82 151 {
Chris@82 152 E T10, T13, TZ, T12, T11;
Chris@82 153 TZ = FNMS(KP356895867, Te, TE);
Chris@82 154 T10 = FNMS(KP692021471, TZ, Tr);
Chris@82 155 T12 = FMA(KP554958132, TM, TW);
Chris@82 156 T13 = FNMS(KP801937735, T12, TR);
Chris@82 157 T11 = FNMS(KP900968867, T10, T1);
Chris@82 158 ri[WS(rs, 5)] = FNMS(KP974927912, T13, T11);
Chris@82 159 ri[WS(rs, 2)] = FMA(KP974927912, T13, T11);
Chris@82 160 }
/* ii outputs 2 and 5. */
Chris@82 161 {
Chris@82 162 E T1m, T1p, T1l, T1o, T1n;
Chris@82 163 T1l = FNMS(KP356895867, T19, T1b);
Chris@82 164 T1m = FNMS(KP692021471, T1l, T1a);
Chris@82 165 T1o = FMA(KP554958132, T1g, T1i);
Chris@82 166 T1p = FNMS(KP801937735, T1o, T1h);
Chris@82 167 T1n = FNMS(KP900968867, T1m, T1c);
Chris@82 168 ii[WS(rs, 2)] = FMA(KP974927912, T1p, T1n);
Chris@82 169 ii[WS(rs, 5)] = FNMS(KP974927912, T1p, T1n);
Chris@82 170 }
/* ri outputs 3 and 4. */
Chris@82 171 {
Chris@82 172 E T15, T18, T14, T17, T16;
Chris@82 173 T14 = FNMS(KP356895867, TE, Tr);
Chris@82 174 T15 = FNMS(KP692021471, T14, Te);
Chris@82 175 T17 = FNMS(KP554958132, TR, TM);
Chris@82 176 T18 = FNMS(KP801937735, T17, TW);
Chris@82 177 T16 = FNMS(KP900968867, T15, T1);
Chris@82 178 ri[WS(rs, 4)] = FNMS(KP974927912, T18, T16);
Chris@82 179 ri[WS(rs, 3)] = FMA(KP974927912, T18, T16);
Chris@82 180 }
/* ii outputs 3 and 4. */
Chris@82 181 {
Chris@82 182 E T1r, T1u, T1q, T1t, T1s;
Chris@82 183 T1q = FNMS(KP356895867, T1b, T1a);
Chris@82 184 T1r = FNMS(KP692021471, T1q, T19);
Chris@82 185 T1t = FNMS(KP554958132, T1h, T1g);
Chris@82 186 T1u = FNMS(KP801937735, T1t, T1i);
Chris@82 187 T1s = FNMS(KP900968867, T1r, T1c);
Chris@82 188 ii[WS(rs, 3)] = FMA(KP974927912, T1u, T1s);
Chris@82 189 ii[WS(rs, 4)] = FNMS(KP974927912, T1u, T1s);
Chris@82 190 }
Chris@82 191 }
Chris@82 192 }
Chris@82 193 }
Chris@82 194
/*
 * Twiddle instruction table referenced by the codelet descriptor `desc`.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT and of the numeric fields
 * is defined by tw_instr's declaration, which is not visible here; the 7
 * matches the first field of `desc` -- confirm against the FFTW headers.
 */
Chris@82 195 static const tw_instr twinstr[] = {
Chris@82 196 {TW_FULL, 0, 7},
Chris@82 197 {TW_NEXT, 1, 0}
Chris@82 198 };
Chris@82 199
/*
 * Codelet descriptor passed to the registration call: first field 7
 * (presumably the radix, matching "-n 7" in the generator command line),
 * the name "t1_7", the twinstr table, &GENUS, and {18, 12, 54, 0}, whose
 * first three entries match the counts quoted in the generator comment for
 * this variant (18 additions, 12 multiplications, 54 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count and of the three trailing
 * zeros comes from ct_desc's declaration, not visible here.
 */
Chris@82 200 static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, {18, 12, 54, 0}, 0, 0, 0 };
Chris@82 201
/*
 * Registration entry point for the FMA-preferring build: hands t1_7 and its
 * descriptor to X(kdft_dit_register) for planner p.
 * NOTE(review): X() is presumably FFTW's name-mangling macro -- it is
 * defined outside this file.
 */
Chris@82 202 void X(codelet_t1_7) (planner *p) {
Chris@82 203 X(kdft_dit_register) (p, t1_7, &desc);
Chris@82 204 }
Chris@82 205 #else
Chris@82 206
Chris@82 207 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */
Chris@82 208
Chris@82 209 /*
Chris@82 210 * This function contains 72 FP additions, 60 FP multiplications,
Chris@82 211 * (or, 36 additions, 24 multiplications, 36 fused multiply/add),
Chris@82 212 * 29 stack variables, 6 constants, and 28 memory accesses
Chris@82 213 */
Chris@82 214 #include "dft/scalar/t.h"
Chris@82 215
/*
 * t1_7 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): genfft-generated twiddle codelet.
 * The generator command line above specifies "-n 7", and the descriptor
 * that follows this function carries 7 as its first field.
 *
 * ri, ii : base pointers of the two data arrays; both are read and then
 *          overwritten in place at offsets 0 and WS(rs, 1)..WS(rs, 6)
 *          (presumably real and imaginary parts -- confirm in
 *          dft/codelet-dft.h).
 * W      : twiddle table; 12 values (W[0]..W[11]) are consumed per
 *          iteration, starting at offset mb * 12.
 * rs     : stride handed to WS() to locate the seven elements.
 * mb, me : the loop runs for m = mb, ..., me - 1.
 * ms     : amount added to ri and ii after every iteration.
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS, FNMA and MAKE_VOLATILE_STRIDE
 * are declared in headers not visible here.  The comments below assume
 * FMA(a, b, c) = a * b + c, FNMS(a, b, c) = c - a * b and
 * FNMA(a, b, c) = -(a * b) - c -- confirm.
 */
Chris@82 216 static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 217 {
/*
 * The literals below are numerically equal to the magnitudes of
 * cos(2*pi*k/7) (0.6234..., 0.2225..., 0.9009... for k = 1, 2, 3) and to
 * sin(2*pi*k/7) (0.7818..., 0.9749..., 0.4338... for k = 1, 2, 3).
 */
Chris@82 218 DK(KP222520933, +0.222520933956314404288902564496794759466355569);
Chris@82 219 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@82 220 DK(KP623489801, +0.623489801858733530525004884004239810632274731);
Chris@82 221 DK(KP433883739, +0.433883739117558120475768332848358754609990728);
Chris@82 222 DK(KP781831482, +0.781831482468029808708444526674057750232334519);
Chris@82 223 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@82 224 {
Chris@82 225 INT m;
/* One pass per m; ri, ii and W are advanced in the loop header itself. */
Chris@82 226 for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
Chris@82 227 E T1, TR, Tc, TS, TC, TO, Tn, TT, TI, TP, Ty, TU, TF, TQ;
/* Element 0 is used as-is; no W entry is applied to it. */
Chris@82 228 T1 = ri[0];
Chris@82 229 TR = ii[0];
/*
 * Elements 1 and 6: combine each with its W pair ((W[0], W[1]) and
 * (W[10], W[11])), then form the sum and difference of the two results
 * for each array (Tc/TS from the ri-side, TO/TC from the ii-side).
 */
Chris@82 230 {
Chris@82 231 E T6, TA, Tb, TB;
Chris@82 232 {
Chris@82 233 E T3, T5, T2, T4;
Chris@82 234 T3 = ri[WS(rs, 1)];
Chris@82 235 T5 = ii[WS(rs, 1)];
Chris@82 236 T2 = W[0];
Chris@82 237 T4 = W[1];
Chris@82 238 T6 = FMA(T2, T3, T4 * T5);
Chris@82 239 TA = FNMS(T4, T3, T2 * T5);
Chris@82 240 }
Chris@82 241 {
Chris@82 242 E T8, Ta, T7, T9;
Chris@82 243 T8 = ri[WS(rs, 6)];
Chris@82 244 Ta = ii[WS(rs, 6)];
Chris@82 245 T7 = W[10];
Chris@82 246 T9 = W[11];
Chris@82 247 Tb = FMA(T7, T8, T9 * Ta);
Chris@82 248 TB = FNMS(T9, T8, T7 * Ta);
Chris@82 249 }
Chris@82 250 Tc = T6 + Tb;
Chris@82 251 TS = Tb - T6;
Chris@82 252 TC = TA - TB;
Chris@82 253 TO = TA + TB;
Chris@82 254 }
/* Elements 2 and 5: same pattern with (W[2], W[3]) and (W[8], W[9]). */
Chris@82 255 {
Chris@82 256 E Th, TG, Tm, TH;
Chris@82 257 {
Chris@82 258 E Te, Tg, Td, Tf;
Chris@82 259 Te = ri[WS(rs, 2)];
Chris@82 260 Tg = ii[WS(rs, 2)];
Chris@82 261 Td = W[2];
Chris@82 262 Tf = W[3];
Chris@82 263 Th = FMA(Td, Te, Tf * Tg);
Chris@82 264 TG = FNMS(Tf, Te, Td * Tg);
Chris@82 265 }
Chris@82 266 {
Chris@82 267 E Tj, Tl, Ti, Tk;
Chris@82 268 Tj = ri[WS(rs, 5)];
Chris@82 269 Tl = ii[WS(rs, 5)];
Chris@82 270 Ti = W[8];
Chris@82 271 Tk = W[9];
Chris@82 272 Tm = FMA(Ti, Tj, Tk * Tl);
Chris@82 273 TH = FNMS(Tk, Tj, Ti * Tl);
Chris@82 274 }
Chris@82 275 Tn = Th + Tm;
Chris@82 276 TT = Tm - Th;
Chris@82 277 TI = TG - TH;
Chris@82 278 TP = TG + TH;
Chris@82 279 }
/* Elements 3 and 4: same pattern with (W[4], W[5]) and (W[6], W[7]). */
Chris@82 280 {
Chris@82 281 E Ts, TD, Tx, TE;
Chris@82 282 {
Chris@82 283 E Tp, Tr, To, Tq;
Chris@82 284 Tp = ri[WS(rs, 3)];
Chris@82 285 Tr = ii[WS(rs, 3)];
Chris@82 286 To = W[4];
Chris@82 287 Tq = W[5];
Chris@82 288 Ts = FMA(To, Tp, Tq * Tr);
Chris@82 289 TD = FNMS(Tq, Tp, To * Tr);
Chris@82 290 }
Chris@82 291 {
Chris@82 292 E Tu, Tw, Tt, Tv;
Chris@82 293 Tu = ri[WS(rs, 4)];
Chris@82 294 Tw = ii[WS(rs, 4)];
Chris@82 295 Tt = W[6];
Chris@82 296 Tv = W[7];
Chris@82 297 Tx = FMA(Tt, Tu, Tv * Tw);
Chris@82 298 TE = FNMS(Tv, Tu, Tt * Tw);
Chris@82 299 }
Chris@82 300 Ty = Ts + Tx;
Chris@82 301 TU = Tx - Ts;
Chris@82 302 TF = TD - TE;
Chris@82 303 TQ = TD + TE;
Chris@82 304 }
/* Output 0: element 0 plus the three pair sums, for each array. */
Chris@82 305 ri[0] = T1 + Tc + Tn + Ty;
Chris@82 306 ii[0] = TO + TP + TQ + TR;
/*
 * Outputs 2 and 5 of both arrays: a weighted combination of the pair sums
 * (Tz / TY) plus or minus a weighted combination of the pair differences
 * (TJ / TX).
 */
Chris@82 307 {
Chris@82 308 E TJ, Tz, TX, TY;
Chris@82 309 TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
Chris@82 310 Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
Chris@82 311 ri[WS(rs, 5)] = Tz - TJ;
Chris@82 312 ri[WS(rs, 2)] = Tz + TJ;
Chris@82 313 TX = FNMS(KP781831482, TU, KP974927912 * TS) - (KP433883739 * TT);
Chris@82 314 TY = FMA(KP623489801, TQ, TR) + FNMA(KP900968867, TP, KP222520933 * TO);
Chris@82 315 ii[WS(rs, 2)] = TX + TY;
Chris@82 316 ii[WS(rs, 5)] = TY - TX;
Chris@82 317 }
/* Outputs 1 and 6 of both arrays, built the same way. */
Chris@82 318 {
Chris@82 319 E TL, TK, TV, TW;
Chris@82 320 TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
Chris@82 321 TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
Chris@82 322 ri[WS(rs, 6)] = TK - TL;
Chris@82 323 ri[WS(rs, 1)] = TK + TL;
Chris@82 324 TV = FMA(KP781831482, TS, KP974927912 * TT) + (KP433883739 * TU);
Chris@82 325 TW = FMA(KP623489801, TO, TR) + FNMA(KP900968867, TQ, KP222520933 * TP);
Chris@82 326 ii[WS(rs, 1)] = TV + TW;
Chris@82 327 ii[WS(rs, 6)] = TW - TV;
Chris@82 328 }
/* Outputs 3 and 4 of both arrays, built the same way. */
Chris@82 329 {
Chris@82 330 E TN, TM, TZ, T10;
Chris@82 331 TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
Chris@82 332 TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
Chris@82 333 ri[WS(rs, 4)] = TM - TN;
Chris@82 334 ri[WS(rs, 3)] = TM + TN;
Chris@82 335 TZ = FMA(KP433883739, TS, KP974927912 * TU) - (KP781831482 * TT);
Chris@82 336 T10 = FMA(KP623489801, TP, TR) + FNMA(KP222520933, TQ, KP900968867 * TO);
Chris@82 337 ii[WS(rs, 3)] = TZ + T10;
Chris@82 338 ii[WS(rs, 4)] = T10 - TZ;
Chris@82 339 }
Chris@82 340 }
Chris@82 341 }
Chris@82 342 }
Chris@82 343
/*
 * Twiddle instruction table referenced by the codelet descriptor `desc`.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT and of the numeric fields
 * is defined by tw_instr's declaration, which is not visible here; the 7
 * matches the first field of `desc` -- confirm against the FFTW headers.
 */
Chris@82 344 static const tw_instr twinstr[] = {
Chris@82 345 {TW_FULL, 0, 7},
Chris@82 346 {TW_NEXT, 1, 0}
Chris@82 347 };
Chris@82 348
/*
 * Codelet descriptor passed to the registration call: first field 7
 * (presumably the radix, matching "-n 7" in the generator command line),
 * the name "t1_7", the twinstr table, &GENUS, and {36, 24, 36, 0}, whose
 * first three entries match the counts quoted in the generator comment for
 * this variant (36 additions, 24 multiplications, 36 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count and of the three trailing
 * zeros comes from ct_desc's declaration, not visible here.
 */
Chris@82 349 static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, {36, 24, 36, 0}, 0, 0, 0 };
Chris@82 350
/*
 * Registration entry point for the non-FMA build: hands t1_7 and its
 * descriptor to X(kdft_dit_register) for planner p.
 * NOTE(review): X() is presumably FFTW's name-mangling macro -- it is
 * defined outside this file.
 */
Chris@82 351 void X(codelet_t1_7) (planner *p) {
Chris@82 352 X(kdft_dit_register) (p, t1_7, &desc);
Chris@82 353 }
Chris@82 354 #endif