annotate src/fftw-3.3.5/dft/scalar/codelets/t1_5.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:36:06 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include t.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 40 FP additions, 34 FP multiplications,
Chris@42 32 * (or, 14 additions, 8 multiplications, 26 fused multiply/add),
Chris@42 33 * 43 stack variables, 4 constants, and 20 memory accesses
Chris@42 34 */
Chris@42 35 #include "t.h"
Chris@42 36
/*
 * t1_5 (HAVE_FMA variant): size-5 twiddle codelet emitted by gen_twiddle
 * (-n 5), working in place on the separate real (ri) and imaginary (ii)
 * arrays.
 *
 * For every m in [mb, me) the loop body
 *   - loads five points from ri/ii[0] and ri/ii[WS(rs, k)], k = 1..4;
 *   - combines points 1..4 with the coefficient pairs (W[0], W[1]),
 *     (W[2], W[3]), (W[4], W[5]) and (W[6], W[7]) respectively;
 *   - writes five results back to the same ten locations.
 * W is first advanced by mb * 8; after each iteration ri and ii advance
 * by ms and W advances by 8.
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R, INT, stride and
 * MAKE_VOLATILE_STRIDE are defined outside this file. The comments below
 * assume FMA(a, b, c) == a*b + c and FNMS(a, b, c) == c - a*b, and that
 * WS(rs, k) selects the k-th element at stride rs -- confirm against the
 * project headers.
 */
Chris@42 37 static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/*
 * Numeric constants used by the butterfly. By value:
 *   KP951056516 ~ sin(2*pi/5), KP559016994 ~ sqrt(5)/4,
 *   KP250000000 = 1/4,         KP618033988 ~ (sqrt(5) - 1)/2.
 */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 {
Chris@42 44 INT m;
/* MAKE_VOLATILE_STRIDE(10, rs) is evaluated once per iteration; its effect is not visible in this file. */
Chris@42 45 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@42 46 E T1, TM, TJ, TA, TQ, Te, TC, Tk, TE, Tq;
Chris@42 47 {
Chris@42 48 E Tg, Tj, Tm, TB, Th, Tp, Tl, Ti, To, TD, Tn;
/* Point 0 is loaded as-is; no W coefficient is applied to it. */
Chris@42 49 T1 = ri[0];
Chris@42 50 TM = ii[0];
Chris@42 51 {
Chris@42 52 E T9, Tc, Ty, Ta, Tb, Tx, T7, Tf, Tz, Td;
Chris@42 53 {
Chris@42 54 E T3, T6, T8, Tw, T4, T2, T5;
/*
 * Loads and partial products for points 1 and 4. The generator was run
 * with -reorder-insns -schedule-for-pipeline, so loads and arithmetic
 * for different points are interleaved.
 */
Chris@42 55 T3 = ri[WS(rs, 1)];
Chris@42 56 T6 = ii[WS(rs, 1)];
Chris@42 57 T2 = W[0];
Chris@42 58 T9 = ri[WS(rs, 4)];
Chris@42 59 Tc = ii[WS(rs, 4)];
Chris@42 60 T8 = W[6];
Chris@42 61 Tw = T2 * T6;
Chris@42 62 T4 = T2 * T3;
Chris@42 63 T5 = W[1];
Chris@42 64 Ty = T8 * Tc;
Chris@42 65 Ta = T8 * T9;
Chris@42 66 Tb = W[7];
/* Point 1 after its coefficients: Tx = W[0]*ii1 - W[1]*ri1, T7 = W[0]*ri1 + W[1]*ii1. */
Chris@42 67 Tx = FNMS(T5, T3, Tw);
Chris@42 68 T7 = FMA(T5, T6, T4);
Chris@42 69 }
Chris@42 70 Tg = ri[WS(rs, 2)];
/* Point 4 after its coefficients: Tz = W[6]*ii4 - W[7]*ri4, Td = W[6]*ri4 + W[7]*ii4. */
Chris@42 71 Tz = FNMS(Tb, T9, Ty);
Chris@42 72 Td = FMA(Tb, Tc, Ta);
Chris@42 73 Tj = ii[WS(rs, 2)];
Chris@42 74 Tf = W[2];
/* Sums and differences of the processed points 1 and 4. */
Chris@42 75 TJ = Tx + Tz;
Chris@42 76 TA = Tx - Tz;
Chris@42 77 TQ = T7 - Td;
Chris@42 78 Te = T7 + Td;
Chris@42 79 Tm = ri[WS(rs, 3)];
Chris@42 80 TB = Tf * Tj;
Chris@42 81 Th = Tf * Tg;
Chris@42 82 Tp = ii[WS(rs, 3)];
Chris@42 83 Tl = W[4];
Chris@42 84 Ti = W[3];
Chris@42 85 To = W[5];
Chris@42 86 }
Chris@42 87 TD = Tl * Tp;
Chris@42 88 Tn = Tl * Tm;
/* Point 2 (TC, Tk) and point 3 (TE, Tq) after W[2..3] and W[4..5], same pattern as above. */
Chris@42 89 TC = FNMS(Ti, Tg, TB);
Chris@42 90 Tk = FMA(Ti, Tj, Th);
Chris@42 91 TE = FNMS(To, Tm, TD);
Chris@42 92 Tq = FMA(To, Tp, Tn);
Chris@42 93 }
Chris@42 94 {
Chris@42 95 E TG, TI, TO, TS, TU, Tu, TN, Tt, TK, TF;
/* Sum and difference of the processed points 2 and 3 (second components). */
Chris@42 96 TK = TC + TE;
Chris@42 97 TF = TC - TE;
Chris@42 98 {
Chris@42 99 E Tr, TR, TL, Ts;
Chris@42 100 Tr = Tk + Tq;
Chris@42 101 TR = Tk - Tq;
Chris@42 102 TG = FMA(KP618033988, TF, TA);
Chris@42 103 TI = FNMS(KP618033988, TA, TF);
Chris@42 104 TO = TJ - TK;
Chris@42 105 TL = TJ + TK;
Chris@42 106 TS = FMA(KP618033988, TR, TQ);
Chris@42 107 TU = FNMS(KP618033988, TQ, TR);
Chris@42 108 Tu = Te - Tr;
Chris@42 109 Ts = Te + Tr;
/* Result 0: point 0 plus the sum of the four processed points. */
Chris@42 110 ii[0] = TL + TM;
Chris@42 111 TN = FNMS(KP250000000, TL, TM);
Chris@42 112 ri[0] = T1 + Ts;
Chris@42 113 Tt = FNMS(KP250000000, Ts, T1);
Chris@42 114 }
Chris@42 115 {
Chris@42 116 E TT, TP, TH, Tv;
Chris@42 117 TT = FNMS(KP559016994, TO, TN);
Chris@42 118 TP = FMA(KP559016994, TO, TN);
Chris@42 119 TH = FNMS(KP559016994, Tu, Tt);
Chris@42 120 Tv = FMA(KP559016994, Tu, Tt);
/* Results 1..4, written back over the input locations. */
Chris@42 121 ii[WS(rs, 4)] = FMA(KP951056516, TS, TP);
Chris@42 122 ii[WS(rs, 1)] = FNMS(KP951056516, TS, TP);
Chris@42 123 ii[WS(rs, 3)] = FNMS(KP951056516, TU, TT);
Chris@42 124 ii[WS(rs, 2)] = FMA(KP951056516, TU, TT);
Chris@42 125 ri[WS(rs, 1)] = FMA(KP951056516, TG, Tv);
Chris@42 126 ri[WS(rs, 4)] = FNMS(KP951056516, TG, Tv);
Chris@42 127 ri[WS(rs, 3)] = FMA(KP951056516, TI, TH);
Chris@42 128 ri[WS(rs, 2)] = FNMS(KP951056516, TI, TH);
Chris@42 129 }
Chris@42 130 }
Chris@42 131 }
Chris@42 132 }
Chris@42 133 }
Chris@42 134
/*
 * Twiddle instruction table referenced by the codelet descriptor below.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined outside this
 * file; the 5 in the first entry presumably matches the codelet size --
 * confirm against the tw_instr definition.
 */
Chris@42 135 static const tw_instr twinstr[] = {
Chris@42 136 {TW_FULL, 0, 5},
Chris@42 137 {TW_NEXT, 1, 0}
Chris@42 138 };
Chris@42 139
/*
 * Descriptor handed to the planner at registration: the value 5, the name
 * "t1_5", the twinstr table, &GENUS, and the counts {14, 8, 26, 0}. The
 * first three counts equal the generator's summary for this variant
 * (14 additions, 8 multiplications, 26 fused multiply/add). The trailing
 * three fields are zero; their meaning is defined by ct_desc elsewhere.
 */
Chris@42 140 static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, {14, 8, 26, 0}, 0, 0, 0 };
Chris@42 141
/*
 * Registration entry point: passes the static t1_5 function and its
 * descriptor to X(kdft_dit_register) for planner p.
 * NOTE(review): X() is a project macro applied to both names here,
 * presumably adding a precision-specific prefix -- confirm.
 */
Chris@42 142 void X(codelet_t1_5) (planner *p) {
Chris@42 143 X(kdft_dit_register) (p, t1_5, &desc);
Chris@42 144 }
Chris@42 145 #else /* HAVE_FMA */
Chris@42 146
Chris@42 147 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include t.h */
Chris@42 148
Chris@42 149 /*
Chris@42 150 * This function contains 40 FP additions, 28 FP multiplications,
Chris@42 151 * (or, 26 additions, 14 multiplications, 14 fused multiply/add),
Chris@42 152 * 29 stack variables, 4 constants, and 20 memory accesses
Chris@42 153 */
Chris@42 154 #include "t.h"
Chris@42 155
/*
 * t1_5 (variant compiled when HAVE_FMA is not defined): size-5 twiddle
 * codelet emitted by gen_twiddle (-n 5), working in place on the separate
 * real (ri) and imaginary (ii) arrays.
 *
 * For every m in [mb, me) the loop body
 *   - loads five points from ri/ii[0] and ri/ii[WS(rs, k)], k = 1..4;
 *   - combines points 1..4 with the coefficient pairs (W[0], W[1]),
 *     (W[2], W[3]), (W[4], W[5]) and (W[6], W[7]) respectively;
 *   - writes five results back to the same ten locations.
 * W is first advanced by mb * 8; after each iteration ri and ii advance
 * by ms and W advances by 8.
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R, INT, stride and
 * MAKE_VOLATILE_STRIDE are defined outside this file. The comments below
 * assume FMA(a, b, c) == a*b + c and FNMS(a, b, c) == c - a*b, and that
 * WS(rs, k) selects the k-th element at stride rs -- confirm against the
 * project headers.
 */
Chris@42 156 static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 157 {
/*
 * Numeric constants used by the butterfly. By value:
 *   KP250000000 = 1/4,         KP559016994 ~ sqrt(5)/4,
 *   KP587785252 ~ sin(pi/5),   KP951056516 ~ sin(2*pi/5).
 */
Chris@42 158 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 159 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 160 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 161 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 162 {
Chris@42 163 INT m;
/* MAKE_VOLATILE_STRIDE(10, rs) is evaluated once per iteration; its effect is not visible in this file. */
Chris@42 164 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@42 165 E T1, TE, Tu, Tx, TJ, TI, TB, TC, TD, Tc, Tn, To;
/* Point 0 is loaded as-is; no W coefficient is applied to it. */
Chris@42 166 T1 = ri[0];
Chris@42 167 TE = ii[0];
Chris@42 168 {
Chris@42 169 E T6, Ts, Tm, Tw, Tb, Tt, Th, Tv;
/*
 * Each of the four inner blocks handles one point k with its pair
 * (Wr, Wi): first result = Wr*ri_k + Wi*ii_k, second = Wr*ii_k - Wi*ri_k.
 */
Chris@42 170 {
Chris@42 171 E T3, T5, T2, T4;
/* Point 1 with (W[0], W[1]) -> T6, Ts. */
Chris@42 172 T3 = ri[WS(rs, 1)];
Chris@42 173 T5 = ii[WS(rs, 1)];
Chris@42 174 T2 = W[0];
Chris@42 175 T4 = W[1];
Chris@42 176 T6 = FMA(T2, T3, T4 * T5);
Chris@42 177 Ts = FNMS(T4, T3, T2 * T5);
Chris@42 178 }
Chris@42 179 {
Chris@42 180 E Tj, Tl, Ti, Tk;
/* Point 3 with (W[4], W[5]) -> Tm, Tw. */
Chris@42 181 Tj = ri[WS(rs, 3)];
Chris@42 182 Tl = ii[WS(rs, 3)];
Chris@42 183 Ti = W[4];
Chris@42 184 Tk = W[5];
Chris@42 185 Tm = FMA(Ti, Tj, Tk * Tl);
Chris@42 186 Tw = FNMS(Tk, Tj, Ti * Tl);
Chris@42 187 }
Chris@42 188 {
Chris@42 189 E T8, Ta, T7, T9;
/* Point 4 with (W[6], W[7]) -> Tb, Tt. */
Chris@42 190 T8 = ri[WS(rs, 4)];
Chris@42 191 Ta = ii[WS(rs, 4)];
Chris@42 192 T7 = W[6];
Chris@42 193 T9 = W[7];
Chris@42 194 Tb = FMA(T7, T8, T9 * Ta);
Chris@42 195 Tt = FNMS(T9, T8, T7 * Ta);
Chris@42 196 }
Chris@42 197 {
Chris@42 198 E Te, Tg, Td, Tf;
/* Point 2 with (W[2], W[3]) -> Th, Tv. */
Chris@42 199 Te = ri[WS(rs, 2)];
Chris@42 200 Tg = ii[WS(rs, 2)];
Chris@42 201 Td = W[2];
Chris@42 202 Tf = W[3];
Chris@42 203 Th = FMA(Td, Te, Tf * Tg);
Chris@42 204 Tv = FNMS(Tf, Te, Td * Tg);
Chris@42 205 }
/* Pairwise differences (1 with 4, 2 with 3), then pairwise sums and their totals TD and To. */
Chris@42 206 Tu = Ts - Tt;
Chris@42 207 Tx = Tv - Tw;
Chris@42 208 TJ = Th - Tm;
Chris@42 209 TI = T6 - Tb;
Chris@42 210 TB = Ts + Tt;
Chris@42 211 TC = Tv + Tw;
Chris@42 212 TD = TB + TC;
Chris@42 213 Tc = T6 + Tb;
Chris@42 214 Tn = Th + Tm;
Chris@42 215 To = Tc + Tn;
Chris@42 216 }
/* Result 0: point 0 plus the sum of the four processed points. */
Chris@42 217 ri[0] = T1 + To;
Chris@42 218 ii[0] = TD + TE;
Chris@42 219 {
Chris@42 220 E Ty, TA, Tr, Tz, Tp, Tq;
/* Results 1..4 of the ri array, written back over the input locations. */
Chris@42 221 Ty = FMA(KP951056516, Tu, KP587785252 * Tx);
Chris@42 222 TA = FNMS(KP587785252, Tu, KP951056516 * Tx);
Chris@42 223 Tp = KP559016994 * (Tc - Tn);
Chris@42 224 Tq = FNMS(KP250000000, To, T1);
Chris@42 225 Tr = Tp + Tq;
Chris@42 226 Tz = Tq - Tp;
Chris@42 227 ri[WS(rs, 4)] = Tr - Ty;
Chris@42 228 ri[WS(rs, 3)] = Tz + TA;
Chris@42 229 ri[WS(rs, 1)] = Tr + Ty;
Chris@42 230 ri[WS(rs, 2)] = Tz - TA;
Chris@42 231 }
Chris@42 232 {
Chris@42 233 E TK, TL, TH, TM, TF, TG;
/* Results 1..4 of the ii array; same structure as the ri block, with the signs of the TK/TL terms swapped. */
Chris@42 234 TK = FMA(KP951056516, TI, KP587785252 * TJ);
Chris@42 235 TL = FNMS(KP587785252, TI, KP951056516 * TJ);
Chris@42 236 TF = KP559016994 * (TB - TC);
Chris@42 237 TG = FNMS(KP250000000, TD, TE);
Chris@42 238 TH = TF + TG;
Chris@42 239 TM = TG - TF;
Chris@42 240 ii[WS(rs, 1)] = TH - TK;
Chris@42 241 ii[WS(rs, 3)] = TM - TL;
Chris@42 242 ii[WS(rs, 4)] = TK + TH;
Chris@42 243 ii[WS(rs, 2)] = TL + TM;
Chris@42 244 }
Chris@42 245 }
Chris@42 246 }
Chris@42 247 }
Chris@42 248
/*
 * Twiddle instruction table referenced by the codelet descriptor below.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined outside this
 * file; the 5 in the first entry presumably matches the codelet size --
 * confirm against the tw_instr definition.
 */
Chris@42 249 static const tw_instr twinstr[] = {
Chris@42 250 {TW_FULL, 0, 5},
Chris@42 251 {TW_NEXT, 1, 0}
Chris@42 252 };
Chris@42 253
/*
 * Descriptor handed to the planner at registration: the value 5, the name
 * "t1_5", the twinstr table, &GENUS, and the counts {26, 14, 14, 0}. The
 * first three counts equal the generator's summary for this variant
 * (26 additions, 14 multiplications, 14 fused multiply/add). The trailing
 * three fields are zero; their meaning is defined by ct_desc elsewhere.
 */
Chris@42 254 static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, {26, 14, 14, 0}, 0, 0, 0 };
Chris@42 255
/*
 * Registration entry point: passes the static t1_5 function and its
 * descriptor to X(kdft_dit_register) for planner p.
 * NOTE(review): X() is a project macro applied to both names here,
 * presumably adding a precision-specific prefix -- confirm.
 */
Chris@42 256 void X(codelet_t1_5) (planner *p) {
Chris@42 257 X(kdft_dit_register) (p, t1_5, &desc);
Chris@42 258 }
Chris@42 259 #endif /* HAVE_FMA */