annotate src/fftw-3.3.8/dft/scalar/codelets/t2_5.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:25 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 44 FP additions, 40 FP multiplications,
Chris@82 32 * (or, 14 additions, 10 multiplications, 30 fused multiply/add),
Chris@82 33 * 38 stack variables, 4 constants, and 20 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/t.h"
Chris@82 36
/*
 * t2_5 (FMA variant): size-5 twiddle codelet working in place on the
 * split real/imaginary arrays ri and ii.
 *
 * The arithmetic notes below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file
 * -- TODO confirm against their definitions.
 *
 * ri, ii: real and imaginary data. Element 0 is ri[0] / ii[0]; elements
 *         1..4 are ri[WS(rs, k)] / ii[WS(rs, k)]. All five are read and
 *         then overwritten with the results.
 * W:      coefficient table; W[0..3] are read on every iteration. W is
 *         first advanced by mb * 4 and then by 4 per iteration.
 * rs:     stride passed to WS() to locate element k.
 * mb, me: the loop runs for m = mb .. me - 1.
 * ms:     amount added to ri and ii after each iteration.
 *
 * Per iteration: the stored pairs (W[0], W[1]) and (W[2], W[3]) are
 * combined into two further pairs, elements 1..4 are each multiplied by
 * one pair, and the products are combined with element 0 into the five
 * outputs.
 */
Chris@82 37 static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5) */
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */
Chris@82 41 DK(KP618033988, +0.618033988749894848204586834365638117720309180); /* numerically (sqrt(5)-1)/2 */
Chris@82 42 DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@82 43 {
Chris@82 44 INT m;
Chris@82 45 for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@82 46 E T2, Ta, T8, T5, Tb, Tm, Tf, Tj, T9, Te;
Chris@82 47 T2 = W[0];
Chris@82 48 Ta = W[3];
Chris@82 49 T8 = W[2];
Chris@82 50 T9 = T2 * T8;
Chris@82 51 Te = T2 * Ta;
Chris@82 52 T5 = W[1];
Chris@82 53 Tb = FNMS(T5, Ta, T9); /* T2*T8 - T5*Ta; paired with Tf for element 4 */
Chris@82 54 Tm = FNMS(T5, T8, Te); /* T2*Ta - T5*T8; paired with Tj for element 2 */
Chris@82 55 Tf = FMA(T5, T8, Te); /* T2*Ta + T5*T8 */
Chris@82 56 Tj = FMA(T5, Ta, T9); /* T2*T8 + T5*Ta */
Chris@82 57 {
Chris@82 58 E T1, TO, T7, Th, Ti, Tz, TB, TL, To, Ts, Tt, TE, TG, TM;
Chris@82 59 T1 = ri[0];
Chris@82 60 TO = ii[0];
Chris@82 61 { /* elements 1 and 4: multiply by (T2, T5) and (Tb, Tf) */
Chris@82 62 E T3, T4, T6, Ty, Tc, Td, Tg, TA;
Chris@82 63 T3 = ri[WS(rs, 1)];
Chris@82 64 T4 = T2 * T3;
Chris@82 65 T6 = ii[WS(rs, 1)];
Chris@82 66 Ty = T2 * T6;
Chris@82 67 Tc = ri[WS(rs, 4)];
Chris@82 68 Td = Tb * Tc;
Chris@82 69 Tg = ii[WS(rs, 4)];
Chris@82 70 TA = Tb * Tg;
Chris@82 71 T7 = FMA(T5, T6, T4); /* T2*T3 + T5*T6 */
Chris@82 72 Th = FMA(Tf, Tg, Td); /* Tb*Tc + Tf*Tg */
Chris@82 73 Ti = T7 + Th;
Chris@82 74 Tz = FNMS(T5, T3, Ty); /* T2*T6 - T5*T3 */
Chris@82 75 TB = FNMS(Tf, Tc, TA); /* Tb*Tg - Tf*Tc */
Chris@82 76 TL = Tz + TB;
Chris@82 77 }
Chris@82 78 { /* elements 2 and 3: multiply by (Tj, Tm) and (T8, Ta) */
Chris@82 79 E Tk, Tl, Tn, TD, Tp, Tq, Tr, TF;
Chris@82 80 Tk = ri[WS(rs, 2)];
Chris@82 81 Tl = Tj * Tk;
Chris@82 82 Tn = ii[WS(rs, 2)];
Chris@82 83 TD = Tj * Tn;
Chris@82 84 Tp = ri[WS(rs, 3)];
Chris@82 85 Tq = T8 * Tp;
Chris@82 86 Tr = ii[WS(rs, 3)];
Chris@82 87 TF = T8 * Tr;
Chris@82 88 To = FMA(Tm, Tn, Tl);
Chris@82 89 Ts = FMA(Ta, Tr, Tq);
Chris@82 90 Tt = To + Ts;
Chris@82 91 TE = FNMS(Tm, Tk, TD);
Chris@82 92 TG = FNMS(Ta, Tp, TF);
Chris@82 93 TM = TE + TG;
Chris@82 94 }
Chris@82 95 { /* write the five real outputs */
Chris@82 96 E Tw, Tu, Tv, TI, TK, TC, TH, TJ, Tx;
Chris@82 97 Tw = Ti - Tt;
Chris@82 98 Tu = Ti + Tt;
Chris@82 99 Tv = FNMS(KP250000000, Tu, T1);
Chris@82 100 TC = Tz - TB;
Chris@82 101 TH = TE - TG;
Chris@82 102 TI = FMA(KP618033988, TH, TC);
Chris@82 103 TK = FNMS(KP618033988, TC, TH);
Chris@82 104 ri[0] = T1 + Tu;
Chris@82 105 TJ = FNMS(KP559016994, Tw, Tv);
Chris@82 106 ri[WS(rs, 2)] = FNMS(KP951056516, TK, TJ);
Chris@82 107 ri[WS(rs, 3)] = FMA(KP951056516, TK, TJ);
Chris@82 108 Tx = FMA(KP559016994, Tw, Tv);
Chris@82 109 ri[WS(rs, 4)] = FNMS(KP951056516, TI, Tx);
Chris@82 110 ri[WS(rs, 1)] = FMA(KP951056516, TI, Tx);
Chris@82 111 }
Chris@82 112 { /* write the five imaginary outputs */
Chris@82 113 E TQ, TN, TP, TU, TW, TS, TT, TV, TR;
Chris@82 114 TQ = TL - TM;
Chris@82 115 TN = TL + TM;
Chris@82 116 TP = FNMS(KP250000000, TN, TO);
Chris@82 117 TS = T7 - Th;
Chris@82 118 TT = To - Ts;
Chris@82 119 TU = FMA(KP618033988, TT, TS);
Chris@82 120 TW = FNMS(KP618033988, TS, TT);
Chris@82 121 ii[0] = TN + TO;
Chris@82 122 TV = FNMS(KP559016994, TQ, TP);
Chris@82 123 ii[WS(rs, 2)] = FMA(KP951056516, TW, TV);
Chris@82 124 ii[WS(rs, 3)] = FNMS(KP951056516, TW, TV);
Chris@82 125 TR = FMA(KP559016994, TQ, TP);
Chris@82 126 ii[WS(rs, 1)] = FNMS(KP951056516, TU, TR);
Chris@82 127 ii[WS(rs, 4)] = FMA(KP951056516, TU, TR);
Chris@82 128 }
Chris@82 129 }
Chris@82 130 }
Chris@82 131 }
Chris@82 132 }
Chris@82 133
/*
 * Twiddle instruction table referenced by desc: two TW_CEXP entries whose
 * last fields are 1 and 3, terminated by a TW_NEXT entry.
 * NOTE(review): presumably these request the two stored coefficient pairs
 * that t2_5 reads as W[0..3] on each iteration -- confirm against the
 * tw_instr definition, which is not part of this file.
 */
Chris@82 134 static const tw_instr twinstr[] = {
Chris@82 135 {TW_CEXP, 0, 1},
Chris@82 136 {TW_CEXP, 0, 3},
Chris@82 137 {TW_NEXT, 1, 0}
Chris@82 138 };
Chris@82 139
/*
 * Codelet descriptor handed to the planner at registration: size 5, the
 * name "t2_5", the twinstr table and &GENUS. The {14, 10, 30, 0} group
 * repeats the addition / multiplication / fused multiply-add counts quoted
 * in the generator comment for this variant. The meaning of the trailing
 * zero fields is set by the ct_desc definition, which is not visible here.
 */
Chris@82 140 static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, {14, 10, 30, 0}, 0, 0, 0 };
Chris@82 141
/*
 * Registration entry point: passes the t2_5 kernel and its descriptor to
 * X(kdft_dit_register) for planner p.
 */
Chris@82 142 void X(codelet_t2_5) (planner *p) {
Chris@82 143 X(kdft_dit_register) (p, t2_5, &desc);
Chris@82 144 }
Chris@82 145 #else
Chris@82 146
Chris@82 147 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */
Chris@82 148
Chris@82 149 /*
Chris@82 150 * This function contains 44 FP additions, 32 FP multiplications,
Chris@82 151 * (or, 30 additions, 18 multiplications, 14 fused multiply/add),
Chris@82 152 * 37 stack variables, 4 constants, and 20 memory accesses
Chris@82 153 */
Chris@82 154 #include "dft/scalar/t.h"
Chris@82 155
/*
 * t2_5 (non-FMA variant): size-5 twiddle codelet working in place on the
 * split real/imaginary arrays ri and ii.
 *
 * The arithmetic notes below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file
 * -- TODO confirm against their definitions.
 *
 * ri, ii: real and imaginary data. Element 0 is ri[0] / ii[0]; elements
 *         1..4 are ri[WS(rs, k)] / ii[WS(rs, k)]. All five are read and
 *         then overwritten with the results.
 * W:      coefficient table; W[0..3] are read on every iteration. W is
 *         first advanced by mb * 4 and then by 4 per iteration.
 * rs:     stride passed to WS() to locate element k.
 * mb, me: the loop runs for m = mb .. me - 1.
 * ms:     amount added to ri and ii after each iteration.
 *
 * Per iteration: the stored pairs (W[0], W[1]) and (W[2], W[3]) are
 * combined into two further pairs, elements 1..4 are each multiplied by
 * one pair, and the products are combined with element 0 into the five
 * outputs.
 */
Chris@82 156 static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 157 {
Chris@82 158 DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@82 159 DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */
Chris@82 160 DK(KP587785252, +0.587785252292473129168705954639072768597652438); /* numerically sin(pi/5) */
Chris@82 161 DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5) */
Chris@82 162 {
Chris@82 163 INT m;
Chris@82 164 for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@82 165 E T2, T4, T7, T9, Tb, Tl, Tf, Tj;
Chris@82 166 { /* build the pairs (Tb, Tf) and (Tj, Tl) from the stored W[0..3] */
Chris@82 167 E T8, Te, Ta, Td;
Chris@82 168 T2 = W[0];
Chris@82 169 T4 = W[1];
Chris@82 170 T7 = W[2];
Chris@82 171 T9 = W[3];
Chris@82 172 T8 = T2 * T7;
Chris@82 173 Te = T4 * T7;
Chris@82 174 Ta = T4 * T9;
Chris@82 175 Td = T2 * T9;
Chris@82 176 Tb = T8 - Ta; /* paired with Tf for element 4 */
Chris@82 177 Tl = Td - Te; /* paired with Tj for element 2 */
Chris@82 178 Tf = Td + Te;
Chris@82 179 Tj = T8 + Ta;
Chris@82 180 }
Chris@82 181 {
Chris@82 182 E T1, TI, Ty, TB, TN, TM, TF, TG, TH, Ti, Tr, Ts;
Chris@82 183 T1 = ri[0];
Chris@82 184 TI = ii[0];
Chris@82 185 {
Chris@82 186 E T6, Tw, Tq, TA, Th, Tx, Tn, Tz;
Chris@82 187 { /* elements 1 and 3: multiply by (T2, T4) and (T7, T9) */
Chris@82 188 E T3, T5, To, Tp;
Chris@82 189 T3 = ri[WS(rs, 1)];
Chris@82 190 T5 = ii[WS(rs, 1)];
Chris@82 191 T6 = FMA(T2, T3, T4 * T5); /* T2*T3 + T4*T5 */
Chris@82 192 Tw = FNMS(T4, T3, T2 * T5); /* T2*T5 - T4*T3 */
Chris@82 193 To = ri[WS(rs, 3)];
Chris@82 194 Tp = ii[WS(rs, 3)];
Chris@82 195 Tq = FMA(T7, To, T9 * Tp);
Chris@82 196 TA = FNMS(T9, To, T7 * Tp);
Chris@82 197 }
Chris@82 198 { /* elements 4 and 2: multiply by (Tb, Tf) and (Tj, Tl) */
Chris@82 199 E Tc, Tg, Tk, Tm;
Chris@82 200 Tc = ri[WS(rs, 4)];
Chris@82 201 Tg = ii[WS(rs, 4)];
Chris@82 202 Th = FMA(Tb, Tc, Tf * Tg);
Chris@82 203 Tx = FNMS(Tf, Tc, Tb * Tg);
Chris@82 204 Tk = ri[WS(rs, 2)];
Chris@82 205 Tm = ii[WS(rs, 2)];
Chris@82 206 Tn = FMA(Tj, Tk, Tl * Tm);
Chris@82 207 Tz = FNMS(Tl, Tk, Tj * Tm);
Chris@82 208 }
Chris@82 209 Ty = Tw - Tx;
Chris@82 210 TB = Tz - TA;
Chris@82 211 TN = Tn - Tq;
Chris@82 212 TM = T6 - Th;
Chris@82 213 TF = Tw + Tx;
Chris@82 214 TG = Tz + TA;
Chris@82 215 TH = TF + TG;
Chris@82 216 Ti = T6 + Th;
Chris@82 217 Tr = Tn + Tq;
Chris@82 218 Ts = Ti + Tr;
Chris@82 219 }
Chris@82 220 ri[0] = T1 + Ts;
Chris@82 221 ii[0] = TH + TI;
Chris@82 222 { /* write real outputs 1..4 */
Chris@82 223 E TC, TE, Tv, TD, Tt, Tu;
Chris@82 224 TC = FMA(KP951056516, Ty, KP587785252 * TB);
Chris@82 225 TE = FNMS(KP587785252, Ty, KP951056516 * TB);
Chris@82 226 Tt = KP559016994 * (Ti - Tr);
Chris@82 227 Tu = FNMS(KP250000000, Ts, T1);
Chris@82 228 Tv = Tt + Tu;
Chris@82 229 TD = Tu - Tt;
Chris@82 230 ri[WS(rs, 4)] = Tv - TC;
Chris@82 231 ri[WS(rs, 3)] = TD + TE;
Chris@82 232 ri[WS(rs, 1)] = Tv + TC;
Chris@82 233 ri[WS(rs, 2)] = TD - TE;
Chris@82 234 }
Chris@82 235 { /* write imaginary outputs 1..4 */
Chris@82 236 E TO, TP, TL, TQ, TJ, TK;
Chris@82 237 TO = FMA(KP951056516, TM, KP587785252 * TN);
Chris@82 238 TP = FNMS(KP587785252, TM, KP951056516 * TN);
Chris@82 239 TJ = KP559016994 * (TF - TG);
Chris@82 240 TK = FNMS(KP250000000, TH, TI);
Chris@82 241 TL = TJ + TK;
Chris@82 242 TQ = TK - TJ;
Chris@82 243 ii[WS(rs, 1)] = TL - TO;
Chris@82 244 ii[WS(rs, 3)] = TQ - TP;
Chris@82 245 ii[WS(rs, 4)] = TO + TL;
Chris@82 246 ii[WS(rs, 2)] = TP + TQ;
Chris@82 247 }
Chris@82 248 }
Chris@82 249 }
Chris@82 250 }
Chris@82 251 }
Chris@82 252
/*
 * Twiddle instruction table referenced by desc: two TW_CEXP entries whose
 * last fields are 1 and 3, terminated by a TW_NEXT entry.
 * NOTE(review): presumably these request the two stored coefficient pairs
 * that t2_5 reads as W[0..3] on each iteration -- confirm against the
 * tw_instr definition, which is not part of this file.
 */
Chris@82 253 static const tw_instr twinstr[] = {
Chris@82 254 {TW_CEXP, 0, 1},
Chris@82 255 {TW_CEXP, 0, 3},
Chris@82 256 {TW_NEXT, 1, 0}
Chris@82 257 };
Chris@82 258
/*
 * Codelet descriptor handed to the planner at registration: size 5, the
 * name "t2_5", the twinstr table and &GENUS. The {30, 18, 14, 0} group
 * repeats the addition / multiplication / fused multiply-add counts quoted
 * in the generator comment for this variant. The meaning of the trailing
 * zero fields is set by the ct_desc definition, which is not visible here.
 */
Chris@82 259 static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, {30, 18, 14, 0}, 0, 0, 0 };
Chris@82 260
/*
 * Registration entry point: passes the t2_5 kernel and its descriptor to
 * X(kdft_dit_register) for planner p.
 */
Chris@82 261 void X(codelet_t2_5) (planner *p) {
Chris@82 262 X(kdft_dit_register) (p, t2_5, &desc);
Chris@82 263 }
Chris@82 264 #endif