annotate src/fftw-3.3.5/dft/scalar/codelets/n1_7.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:35:50 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include n.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 60 FP additions, 42 FP multiplications,
Chris@42 32 * (or, 18 additions, 0 multiplications, 42 fused multiply/add),
Chris@42 33 * 51 stack variables, 6 constants, and 28 memory accesses
Chris@42 34 */
Chris@42 35 #include "n.h"
Chris@42 36
/*
 * n1_7 (HAVE_FMA variant): genfft-generated kernel for a 7-point complex
 * transform operating on split real/imaginary arrays.
 *
 *   ri, ii   - real and imaginary input arrays (const, only read)
 *   ro, io   - real and imaginary output arrays (only written)
 *   is, os   - input / output strides, applied to element indices via WS()
 *   v        - number of transforms to perform (loop trip count)
 *   ivs, ovs - offsets added to the input / output pointers after each
 *              transform
 *
 * Each iteration reads elements 0..6 of ri and ii and writes elements 0..6
 * of ro and io.  Element 0 of each output is the plain sum of the seven
 * corresponding inputs; outputs k and 7-k (k = 1..3) are stored as an
 * FMA / FNMS pair that shares the same two operands.
 *
 * NOTE(review): DK, E, R, INT, stride, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are supplied by the included headers and are not
 * visible in this file; their exact definitions should be checked there.
 */
Chris@42 37 static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Numeric constants used below.  KP974927912 and KP900968867 are numerically
 * sin(4*pi/7) and -cos(6*pi/7); the remaining four are generator-derived. */
Chris@42 39 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@42 40 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@42 41 DK(KP801937735, +0.801937735804838252472204639014890102331838324);
Chris@42 42 DK(KP692021471, +0.692021471630095869627814897002069140197260599);
Chris@42 43 DK(KP356895867, +0.356895867892209443894399510021300583399127187);
Chris@42 44 DK(KP554958132, +0.554958132087371191422194871006410481067288862);
Chris@42 45 {
Chris@42 46 INT i;
/* One pass per transform: the comma clauses count i down from v and advance
 * the input pointers by ivs and the output pointers by ovs. */
Chris@42 47 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
/* Values that are still needed after the nested scopes below have closed
 * (they feed the final stores to outputs 2..5). */
Chris@42 48 E Tz, TP, Ty, TK, TN, TE, Tw, TF;
Chris@42 49 {
Chris@42 50 E T1, TI, T4, TG, Ta, TT, Tp, TH, T7, Tk, TJ, TO, Tu, Tb, TB;
Chris@42 51 E Tg, Tl, Th, Ti;
/* Element 0 of the real and imaginary inputs. */
Chris@42 52 T1 = ri[0];
Chris@42 53 Tz = ii[0];
Chris@42 54 {
Chris@42 55 E T5, T6, Te, Tf;
Chris@42 56 {
Chris@42 57 E T2, T3, T8, T9;
/* Real inputs are combined in pairs (1,6), (3,4), (2,5): T4/Ta/T7 hold the
 * pair sums, TI/TG/TH the pair differences (higher index minus lower). */
Chris@42 58 T2 = ri[WS(is, 1)];
Chris@42 59 T3 = ri[WS(is, 6)];
Chris@42 60 T8 = ri[WS(is, 3)];
Chris@42 61 T9 = ri[WS(is, 4)];
Chris@42 62 T5 = ri[WS(is, 2)];
Chris@42 63 TI = T3 - T2;
Chris@42 64 T4 = T2 + T3;
Chris@42 65 TG = T9 - T8;
Chris@42 66 Ta = T8 + T9;
Chris@42 67 T6 = ri[WS(is, 5)];
Chris@42 68 }
/* Imaginary loads are interleaved with arithmetic on the real-side values. */
Chris@42 69 Te = ii[WS(is, 2)];
Chris@42 70 TT = FMA(KP554958132, TG, TI);
Chris@42 71 Tp = FNMS(KP356895867, T4, Ta);
Chris@42 72 TH = T6 - T5;
Chris@42 73 T7 = T5 + T6;
Chris@42 74 Tf = ii[WS(is, 5)];
Chris@42 75 Tk = ii[WS(is, 3)];
Chris@42 76 TJ = FNMS(KP554958132, TI, TH);
Chris@42 77 TO = FMA(KP554958132, TH, TG);
Chris@42 78 Tu = FNMS(KP356895867, Ta, T7);
Chris@42 79 Tb = FNMS(KP356895867, T7, T4);
/* Imaginary pair (2,5): sum in TB, difference (lower minus higher) in Tg. */
Chris@42 80 TB = Te + Tf;
Chris@42 81 Tg = Te - Tf;
Chris@42 82 Tl = ii[WS(is, 4)];
Chris@42 83 Th = ii[WS(is, 1)];
Chris@42 84 Ti = ii[WS(is, 6)];
Chris@42 85 }
Chris@42 86 {
Chris@42 87 E Tm, TA, Tj, TD, Ts, TL, Tx, TU, To, TR, Td, TM, Tv;
Chris@42 88 {
Chris@42 89 E TC, TQ, Tn, Tc;
/* Real output 0: sum of all seven real inputs. */
Chris@42 90 ro[0] = T1 + T4 + T7 + Ta;
/* Imaginary pairs (3,4) and (1,6): sums in TC/TA, differences in Tm/Tj. */
Chris@42 91 TC = Tk + Tl;
Chris@42 92 Tm = Tk - Tl;
Chris@42 93 TA = Th + Ti;
Chris@42 94 Tj = Th - Ti;
Chris@42 95 TD = FNMS(KP356895867, TC, TB);
Chris@42 96 Ts = FMA(KP554958132, Tg, Tm);
Chris@42 97 TL = FNMS(KP356895867, TA, TC);
Chris@42 98 TQ = FNMS(KP356895867, TB, TA);
Chris@42 99 Tx = FNMS(KP554958132, Tj, Tg);
Chris@42 100 Tn = FMA(KP554958132, Tm, Tj);
/* Imaginary output 0: sum of all seven imaginary inputs. */
Chris@42 101 io[0] = Tz + TA + TB + TC;
Chris@42 102 Tc = FNMS(KP692021471, Tb, Ta);
Chris@42 103 TU = FMA(KP801937735, TT, TH);
Chris@42 104 To = FMA(KP801937735, Tn, Tg);
Chris@42 105 TR = FNMS(KP692021471, TQ, TC);
Chris@42 106 Td = FNMS(KP900968867, Tc, T1);
Chris@42 107 }
Chris@42 108 {
Chris@42 109 E Tt, Tr, TS, Tq;
Chris@42 110 Tt = FNMS(KP801937735, Ts, Tj);
Chris@42 111 Tq = FNMS(KP692021471, Tp, T7);
Chris@42 112 TS = FNMS(KP900968867, TR, Tz);
/* Outputs 1 and 6 (real, then imaginary) from the shared operands To/Td and
 * TU/TS. */
Chris@42 113 ro[WS(os, 1)] = FMA(KP974927912, To, Td);
Chris@42 114 ro[WS(os, 6)] = FNMS(KP974927912, To, Td);
Chris@42 115 Tr = FNMS(KP900968867, Tq, T1);
Chris@42 116 io[WS(os, 6)] = FNMS(KP974927912, TU, TS);
Chris@42 117 io[WS(os, 1)] = FMA(KP974927912, TU, TS);
Chris@42 118 TP = FNMS(KP801937735, TO, TI);
/* Real outputs 2 and 5 from the shared operands Tt/Tr. */
Chris@42 119 ro[WS(os, 2)] = FMA(KP974927912, Tt, Tr);
Chris@42 120 ro[WS(os, 5)] = FNMS(KP974927912, Tt, Tr);
Chris@42 121 TM = FNMS(KP692021471, TL, TB);
Chris@42 122 }
/* Remaining intermediates for imaginary outputs 2..5 and real outputs 3/4;
 * Ty, TK, TN, TE and Tw are consumed after this scope closes. */
Chris@42 123 Ty = FNMS(KP801937735, Tx, Tm);
Chris@42 124 Tv = FNMS(KP692021471, Tu, T4);
Chris@42 125 TK = FNMS(KP801937735, TJ, TG);
Chris@42 126 TN = FNMS(KP900968867, TM, Tz);
Chris@42 127 TE = FNMS(KP692021471, TD, TA);
Chris@42 128 Tw = FNMS(KP900968867, Tv, T1);
Chris@42 129 }
Chris@42 130 }
/* Final stores: imaginary outputs 2/5, then real and imaginary outputs 3/4. */
Chris@42 131 io[WS(os, 5)] = FNMS(KP974927912, TP, TN);
Chris@42 132 io[WS(os, 2)] = FMA(KP974927912, TP, TN);
Chris@42 133 TF = FNMS(KP900968867, TE, Tz);
Chris@42 134 ro[WS(os, 3)] = FMA(KP974927912, Ty, Tw);
Chris@42 135 ro[WS(os, 4)] = FNMS(KP974927912, Ty, Tw);
Chris@42 136 io[WS(os, 4)] = FNMS(KP974927912, TK, TF);
Chris@42 137 io[WS(os, 3)] = FMA(KP974927912, TK, TF);
Chris@42 138 }
Chris@42 139 }
Chris@42 140 }
Chris@42 141
/*
 * Descriptor handed to X(kdft_register) together with the FMA kernel.
 * It carries the value 7, the name "n1_7", the tuple {18, 0, 42, 0} and
 * &GENUS.  The first three tuple entries match the add / multiply / fused
 * multiply-add counts quoted in the generator comment for this variant.
 * NOTE(review): kdft_desc and GENUS are declared in headers not visible
 * here; the 7 is presumably the transform size and the meaning of the
 * trailing zeros should be confirmed against the struct definition.
 */
Chris@42 142 static const kdft_desc desc = { 7, "n1_7", {18, 0, 42, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 143
/*
 * Registration entry point for the FMA build: passes the planner p, the
 * n1_7 kernel and its descriptor to X(kdft_register).
 * NOTE(review): X() and planner are defined outside this file; X() is
 * presumably the library's symbol-name wrapper -- confirm in the headers.
 */
Chris@42 144 void X(codelet_n1_7) (planner *p) {
Chris@42 145 X(kdft_register) (p, n1_7, &desc);
Chris@42 146 }
Chris@42 147
Chris@42 148 #else /* HAVE_FMA */
Chris@42 149
Chris@42 150 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include n.h */
Chris@42 151
Chris@42 152 /*
Chris@42 153 * This function contains 60 FP additions, 36 FP multiplications,
Chris@42 154 * (or, 36 additions, 12 multiplications, 24 fused multiply/add),
Chris@42 155 * 25 stack variables, 6 constants, and 28 memory accesses
Chris@42 156 */
Chris@42 157 #include "n.h"
Chris@42 158
/*
 * n1_7 (variant compiled when HAVE_FMA is not defined): genfft-generated
 * kernel for a 7-point complex transform on split real/imaginary arrays.
 *
 *   ri, ii   - real and imaginary input arrays (const, only read)
 *   ro, io   - real and imaginary output arrays (only written)
 *   is, os   - input / output strides, applied to element indices via WS()
 *   v        - number of transforms to perform (loop trip count)
 *   ivs, ovs - offsets added to the input / output pointers after each
 *              transform
 *
 * Each iteration loads inputs in the pairs (1,6), (2,5), (3,4), forms their
 * sums and differences, and then writes output pairs (2,5), (1,6), (3,4) as
 * the sum and difference of two weighted combinations.  Element 0 of each
 * output is the plain sum of the seven corresponding inputs.
 *
 * NOTE(review): DK, E, R, INT, stride, WS, FMA, FNMS, FNMA and
 * MAKE_VOLATILE_STRIDE are supplied by the included headers and are not
 * visible in this file; their exact definitions should be checked there.
 */
Chris@42 159 static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 160 {
/* Numerically: KP623489801 = cos(2*pi/7), KP222520933 = -cos(4*pi/7),
 * KP900968867 = -cos(6*pi/7), KP781831482 = sin(2*pi/7),
 * KP974927912 = sin(4*pi/7), KP433883739 = sin(6*pi/7). */
Chris@42 161 DK(KP222520933, +0.222520933956314404288902564496794759466355569);
Chris@42 162 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@42 163 DK(KP623489801, +0.623489801858733530525004884004239810632274731);
Chris@42 164 DK(KP433883739, +0.433883739117558120475768332848358754609990728);
Chris@42 165 DK(KP781831482, +0.781831482468029808708444526674057750232334519);
Chris@42 166 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@42 167 {
Chris@42 168 INT i;
/* One pass per transform: the comma clauses count i down from v and advance
 * the input pointers by ivs and the output pointers by ovs. */
Chris@42 169 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
Chris@42 170 E T1, Tu, T4, Tq, Te, Tx, T7, Ts, Tk, Tv, Ta, Tr, Th, Tw;
/* Element 0 of the real and imaginary inputs. */
Chris@42 171 T1 = ri[0];
Chris@42 172 Tu = ii[0];
Chris@42 173 {
/* Pair (1,6): real sum T4 and difference Tq; imaginary difference Te and
 * sum Tx.  Note the real difference is 6-minus-1, the imaginary 1-minus-6. */
Chris@42 174 E T2, T3, Tc, Td;
Chris@42 175 T2 = ri[WS(is, 1)];
Chris@42 176 T3 = ri[WS(is, 6)];
Chris@42 177 T4 = T2 + T3;
Chris@42 178 Tq = T3 - T2;
Chris@42 179 Tc = ii[WS(is, 1)];
Chris@42 180 Td = ii[WS(is, 6)];
Chris@42 181 Te = Tc - Td;
Chris@42 182 Tx = Tc + Td;
Chris@42 183 }
Chris@42 184 {
/* Pair (2,5): real sum T7 and difference Ts; imaginary difference Tk and
 * sum Tv. */
Chris@42 185 E T5, T6, Ti, Tj;
Chris@42 186 T5 = ri[WS(is, 2)];
Chris@42 187 T6 = ri[WS(is, 5)];
Chris@42 188 T7 = T5 + T6;
Chris@42 189 Ts = T6 - T5;
Chris@42 190 Ti = ii[WS(is, 2)];
Chris@42 191 Tj = ii[WS(is, 5)];
Chris@42 192 Tk = Ti - Tj;
Chris@42 193 Tv = Ti + Tj;
Chris@42 194 }
Chris@42 195 {
/* Pair (3,4): real sum Ta and difference Tr; imaginary difference Th and
 * sum Tw. */
Chris@42 196 E T8, T9, Tf, Tg;
Chris@42 197 T8 = ri[WS(is, 3)];
Chris@42 198 T9 = ri[WS(is, 4)];
Chris@42 199 Ta = T8 + T9;
Chris@42 200 Tr = T9 - T8;
Chris@42 201 Tf = ii[WS(is, 3)];
Chris@42 202 Tg = ii[WS(is, 4)];
Chris@42 203 Th = Tf - Tg;
Chris@42 204 Tw = Tf + Tg;
Chris@42 205 }
/* Output 0: sum of all seven inputs, real and imaginary separately. */
Chris@42 206 ro[0] = T1 + T4 + T7 + Ta;
Chris@42 207 io[0] = Tu + Tx + Tv + Tw;
Chris@42 208 {
/* Outputs 2 and 5.  Tb/TC combine element 0 with the pair sums; Tl/TB
 * combine the pair differences. */
Chris@42 209 E Tl, Tb, TB, TC;
Chris@42 210 Tl = FNMS(KP781831482, Th, KP974927912 * Te) - (KP433883739 * Tk);
Chris@42 211 Tb = FMA(KP623489801, Ta, T1) + FNMA(KP900968867, T7, KP222520933 * T4);
Chris@42 212 ro[WS(os, 5)] = Tb - Tl;
Chris@42 213 ro[WS(os, 2)] = Tb + Tl;
Chris@42 214 TB = FNMS(KP781831482, Tr, KP974927912 * Tq) - (KP433883739 * Ts);
Chris@42 215 TC = FMA(KP623489801, Tw, Tu) + FNMA(KP900968867, Tv, KP222520933 * Tx);
Chris@42 216 io[WS(os, 2)] = TB + TC;
Chris@42 217 io[WS(os, 5)] = TC - TB;
Chris@42 218 }
Chris@42 219 {
/* Outputs 1 and 6, built the same way from Tm/Tn (real) and TA/Tz
 * (imaginary). */
Chris@42 220 E Tn, Tm, Tz, TA;
Chris@42 221 Tn = FMA(KP781831482, Te, KP974927912 * Tk) + (KP433883739 * Th);
Chris@42 222 Tm = FMA(KP623489801, T4, T1) + FNMA(KP900968867, Ta, KP222520933 * T7);
Chris@42 223 ro[WS(os, 6)] = Tm - Tn;
Chris@42 224 ro[WS(os, 1)] = Tm + Tn;
Chris@42 225 Tz = FMA(KP781831482, Tq, KP974927912 * Ts) + (KP433883739 * Tr);
Chris@42 226 TA = FMA(KP623489801, Tx, Tu) + FNMA(KP900968867, Tw, KP222520933 * Tv);
Chris@42 227 io[WS(os, 1)] = Tz + TA;
Chris@42 228 io[WS(os, 6)] = TA - Tz;
Chris@42 229 }
Chris@42 230 {
/* Outputs 3 and 4, built the same way from To/Tp (real) and Ty/Tt
 * (imaginary). */
Chris@42 231 E Tp, To, Tt, Ty;
Chris@42 232 Tp = FMA(KP433883739, Te, KP974927912 * Th) - (KP781831482 * Tk);
Chris@42 233 To = FMA(KP623489801, T7, T1) + FNMA(KP222520933, Ta, KP900968867 * T4);
Chris@42 234 ro[WS(os, 4)] = To - Tp;
Chris@42 235 ro[WS(os, 3)] = To + Tp;
Chris@42 236 Tt = FMA(KP433883739, Tq, KP974927912 * Tr) - (KP781831482 * Ts);
Chris@42 237 Ty = FMA(KP623489801, Tv, Tu) + FNMA(KP222520933, Tw, KP900968867 * Tx);
Chris@42 238 io[WS(os, 3)] = Tt + Ty;
Chris@42 239 io[WS(os, 4)] = Ty - Tt;
Chris@42 240 }
Chris@42 241 }
Chris@42 242 }
Chris@42 243 }
Chris@42 244
/*
 * Descriptor handed to X(kdft_register) together with the non-FMA kernel.
 * It carries the value 7, the name "n1_7", the tuple {36, 12, 24, 0} and
 * &GENUS.  The first three tuple entries match the add / multiply / fused
 * multiply-add counts quoted in the generator comment for this variant.
 * NOTE(review): kdft_desc and GENUS are declared in headers not visible
 * here; the 7 is presumably the transform size and the meaning of the
 * trailing zeros should be confirmed against the struct definition.
 */
Chris@42 245 static const kdft_desc desc = { 7, "n1_7", {36, 12, 24, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 246
/*
 * Registration entry point for the non-FMA build: passes the planner p, the
 * n1_7 kernel and its descriptor to X(kdft_register).
 * NOTE(review): X() and planner are defined outside this file; X() is
 * presumably the library's symbol-name wrapper -- confirm in the headers.
 */
Chris@42 247 void X(codelet_n1_7) (planner *p) {
Chris@42 248 X(kdft_register) (p, n1_7, &desc);
Chris@42 249 }
Chris@42 250
Chris@42 251 #endif /* HAVE_FMA */