annotate src/fftw-3.3.5/dft/simd/common/t1fuv_7.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:41:49 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name t1fuv_7 -include t1fu.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 36 FP additions, 36 FP multiplications,
Chris@42 32 * (or, 15 additions, 15 multiplications, 21 fused multiply/add),
Chris@42 33 * 42 stack variables, 6 constants, and 14 memory accesses
Chris@42 34 */
Chris@42 35 #include "t1fu.h"
Chris@42 36
/*
 * t1fuv_7 (HAVE_FMA variant).
 *
 * For each step of the m loop this loads seven vectors from x (x[0] and
 * x[WS(rs, 1)] .. x[WS(rs, 6)]), passes inputs 1..6 through BYTWJ with
 * entries of W, combines them, and stores seven results back to the same
 * seven locations.
 *
 *   ri     - base of the data; the only data pointer used (x = ri).
 *   ii     - not referenced anywhere in this body.
 *   W      - table read by BYTWJ; offset by mb * ((TWVL / VL) * 12) before
 *            the first iteration and advanced by TWVL * 12 after each one.
 *   rs     - stride handed to WS() to address the seven points.
 *   mb, me - loop bounds: m starts at mb and runs while m < me, step VL.
 *   ms     - x advances by VL * ms per iteration; also passed to LD/ST.
 *
 * NOTE(review): V, LD, ST, BYTWJ, DVK, LDK, VFMA, VFNMS, VFMAI, VFNMSI,
 * WS, MAKE_VOLATILE_STRIDE and VLEAVE are defined in headers that are not
 * part of this listing; their exact semantics should be confirmed there.
 */
Chris@42 37 static void t1fuv_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Six named constants, each read later through LDK(). KP900968867 and
 * KP974927912 also appear in the non-FMA variant of this function; the
 * other four are used only by this variant. */
Chris@42 39 DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@42 40 DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
Chris@42 41 DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@42 42 DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
Chris@42 43 DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
Chris@42 44 DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
Chris@42 45 {
Chris@42 46 INT m;
Chris@42 47 R *x;
Chris@42 48 x = ri;
/* One iteration handles VL values of m; x and W are stepped in the
 * loop's increment expression. */
Chris@42 49 for (m = mb, W = W + (mb * ((TWVL / VL) * 12)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 12), MAKE_VOLATILE_STRIDE(7, rs)) {
Chris@42 50 V T1, T2, T4, Te, Tc, T9, T7;
/* Load all seven inputs up front: T1 = point 0, T2 = 1, T4 = 6,
 * Te = 4, Tc = 3, T9 = 5, T7 = 2. */
Chris@42 51 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 52 T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 53 T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 54 Te = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 55 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 56 T9 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 57 T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 58 {
Chris@42 59 V T3, T5, Tf, Td, Ta, T8;
/* Apply BYTWJ to inputs 1..6; input k uses W[TWVL * 2 * (k - 1)].
 * Input 0 (T1) is used as loaded. */
Chris@42 60 T3 = BYTWJ(&(W[0]), T2);
Chris@42 61 T5 = BYTWJ(&(W[TWVL * 10]), T4);
Chris@42 62 Tf = BYTWJ(&(W[TWVL * 6]), Te);
Chris@42 63 Td = BYTWJ(&(W[TWVL * 4]), Tc);
Chris@42 64 Ta = BYTWJ(&(W[TWVL * 8]), T9);
Chris@42 65 T8 = BYTWJ(&(W[TWVL * 2]), T7);
Chris@42 66 {
Chris@42 67 V T6, Tk, Tg, Tl, Tb, Tm;
/* Sum and difference of the input pairs (1,6), (3,4) and (2,5). */
Chris@42 68 T6 = VADD(T3, T5);
Chris@42 69 Tk = VSUB(T5, T3);
Chris@42 70 Tg = VADD(Td, Tf);
Chris@42 71 Tl = VSUB(Tf, Td);
Chris@42 72 Tb = VADD(T8, Ta);
Chris@42 73 Tm = VSUB(Ta, T8);
Chris@42 74 {
Chris@42 75 V Th, Ts, Tp, Tu, Tn, Tx, Ti, Tt;
Chris@42 76 Th = VFNMS(LDK(KP356895867), T6, Tg);
Chris@42 77 Ts = VFMA(LDK(KP554958132), Tl, Tk);
/* Output 0 is the sum of input 0 and the three pair sums. */
Chris@42 78 ST(&(x[0]), VADD(T1, VADD(T6, VADD(Tb, Tg))), ms, &(x[0]));
Chris@42 79 Tp = VFNMS(LDK(KP356895867), Tb, T6);
Chris@42 80 Tu = VFNMS(LDK(KP356895867), Tg, Tb);
Chris@42 81 Tn = VFMA(LDK(KP554958132), Tm, Tl);
Chris@42 82 Tx = VFNMS(LDK(KP554958132), Tk, Tm);
Chris@42 83 Ti = VFNMS(LDK(KP692021471), Th, Tb);
Chris@42 84 Tt = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), Ts, Tm));
Chris@42 85 {
Chris@42 86 V Tq, Tv, To, Ty, Tj, Tr, Tw;
Chris@42 87 Tq = VFNMS(LDK(KP692021471), Tp, Tg);
Chris@42 88 Tv = VFNMS(LDK(KP692021471), Tu, T6);
Chris@42 89 To = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tn, Tk));
Chris@42 90 Ty = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tx, Tl));
Chris@42 91 Tj = VFNMS(LDK(KP900968867), Ti, T1);
Chris@42 92 Tr = VFNMS(LDK(KP900968867), Tq, T1);
Chris@42 93 Tw = VFNMS(LDK(KP900968867), Tv, T1);
/* Outputs are written in pairs (2,5), (1,6), (3,4); each pair
 * shares the same two operands and differs only in VFMAI vs
 * VFNMSI. Terms built from pair sums (Tj, Tr, Tw) are combined
 * with terms built from pair differences (To, Tt, Ty). */
Chris@42 94 ST(&(x[WS(rs, 2)]), VFMAI(To, Tj), ms, &(x[0]));
Chris@42 95 ST(&(x[WS(rs, 5)]), VFNMSI(To, Tj), ms, &(x[WS(rs, 1)]));
Chris@42 96 ST(&(x[WS(rs, 1)]), VFMAI(Tt, Tr), ms, &(x[WS(rs, 1)]));
Chris@42 97 ST(&(x[WS(rs, 6)]), VFNMSI(Tt, Tr), ms, &(x[0]));
Chris@42 98 ST(&(x[WS(rs, 3)]), VFMAI(Ty, Tw), ms, &(x[WS(rs, 1)]));
Chris@42 99 ST(&(x[WS(rs, 4)]), VFNMSI(Ty, Tw), ms, &(x[0]));
Chris@42 100 }
Chris@42 101 }
Chris@42 102 }
Chris@42 103 }
Chris@42 104 }
Chris@42 105 }
Chris@42 106 VLEAVE();
Chris@42 107 }
Chris@42 108
/*
 * Twiddle instruction table referenced by desc: six VTW entries whose
 * second argument runs 1..6, followed by a {TW_NEXT, VL, 0} entry. The
 * six VTW entries line up with the six BYTWJ multiplications performed
 * in t1fuv_7.
 * NOTE(review): VTW and TW_NEXT are defined in headers not shown here;
 * confirm their exact meaning there.
 */
Chris@42 109 static const tw_instr twinstr[] = {
Chris@42 110 VTW(0, 1),
Chris@42 111 VTW(0, 2),
Chris@42 112 VTW(0, 3),
Chris@42 113 VTW(0, 4),
Chris@42 114 VTW(0, 5),
Chris@42 115 VTW(0, 6),
Chris@42 116 {TW_NEXT, VL, 0}
Chris@42 117 };
Chris@42 118
/*
 * Codelet descriptor passed at registration. Fields, in order: 7 (the
 * generator was invoked with -n 7), the codelet name string, the twinstr
 * table, &GENUS, and {15, 15, 21, 0}, which matches the "15 additions,
 * 15 multiplications, 21 fused multiply/add" counts in this variant's
 * generated header comment. The three trailing zeros are left as
 * generated. NOTE(review): field names live in the ct_desc declaration,
 * which is not shown here.
 */
Chris@42 119 static const ct_desc desc = { 7, XSIMD_STRING("t1fuv_7"), twinstr, &GENUS, {15, 15, 21, 0}, 0, 0, 0 };
Chris@42 120
/*
 * Public entry point of this file (HAVE_FMA build): hands t1fuv_7 and its
 * descriptor to planner p through X(kdft_dit_register).
 * NOTE(review): XSIMD() and X() are name-wrapping macros defined
 * elsewhere; the final symbol name depends on them.
 */
Chris@42 121 void XSIMD(codelet_t1fuv_7) (planner *p) {
Chris@42 122 X(kdft_dit_register) (p, t1fuv_7, &desc);
Chris@42 123 }
Chris@42 124 #else /* HAVE_FMA */
Chris@42 125
Chris@42 126 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 7 -name t1fuv_7 -include t1fu.h */
Chris@42 127
Chris@42 128 /*
Chris@42 129 * This function contains 36 FP additions, 30 FP multiplications,
Chris@42 130 * (or, 24 additions, 18 multiplications, 12 fused multiply/add),
Chris@42 131 * 21 stack variables, 6 constants, and 14 memory accesses
Chris@42 132 */
Chris@42 133 #include "t1fu.h"
Chris@42 134
/*
 * t1fuv_7 (variant compiled when HAVE_FMA is not defined).
 *
 * For each step of the m loop this loads seven vectors from x (x[0] and
 * x[WS(rs, 1)] .. x[WS(rs, 6)]), passes inputs 1..6 through BYTWJ with
 * entries of W, combines them, and stores seven results back to the same
 * seven locations.
 *
 *   ri     - base of the data; the only data pointer used (x = ri).
 *   ii     - not referenced anywhere in this body.
 *   W      - table read by BYTWJ; offset by mb * ((TWVL / VL) * 12) before
 *            the first iteration and advanced by TWVL * 12 after each one.
 *   rs     - stride handed to WS() to address the seven points.
 *   mb, me - loop bounds: m starts at mb and runs while m < me, step VL.
 *   ms     - x advances by VL * ms per iteration; also passed to LD/ST.
 *
 * NOTE(review): V, LD, ST, BYTWJ, DVK, LDK, VFMA, VFNMS, VBYI, WS,
 * MAKE_VOLATILE_STRIDE and VLEAVE are defined in headers that are not
 * part of this listing; their exact semantics should be confirmed there.
 */
Chris@42 135 static void t1fuv_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 136 {
/* Six named constants, each read later through LDK(). Numerically they
 * agree with |cos(2*pi*k/7)| (KP623489801, KP222520933, KP900968867 for
 * k = 1, 2, 3) and sin(2*pi*k/7) (KP781831482, KP974927912, KP433883739
 * for k = 1, 2, 3). */
Chris@42 137 DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@42 138 DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
Chris@42 139 DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
Chris@42 140 DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
Chris@42 141 DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@42 142 DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
Chris@42 143 {
Chris@42 144 INT m;
Chris@42 145 R *x;
Chris@42 146 x = ri;
/* One iteration handles VL values of m; x and W are stepped in the
 * loop's increment expression. */
Chris@42 147 for (m = mb, W = W + (mb * ((TWVL / VL) * 12)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 12), MAKE_VOLATILE_STRIDE(7, rs)) {
Chris@42 148 V T1, Tg, Tj, T6, Ti, Tb, Tk, Tp, To;
/* Input 0 is used as loaded (no BYTWJ). */
Chris@42 149 T1 = LD(&(x[0]), ms, &(x[0]));
/* Pair (3,4): load, apply BYTWJ, keep sum Tg and difference Tj. */
Chris@42 150 {
Chris@42 151 V Td, Tf, Tc, Te;
Chris@42 152 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 153 Td = BYTWJ(&(W[TWVL * 4]), Tc);
Chris@42 154 Te = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 155 Tf = BYTWJ(&(W[TWVL * 6]), Te);
Chris@42 156 Tg = VADD(Td, Tf);
Chris@42 157 Tj = VSUB(Tf, Td);
Chris@42 158 }
/* Pair (1,6): load, apply BYTWJ, keep sum T6 and difference Ti. */
Chris@42 159 {
Chris@42 160 V T3, T5, T2, T4;
Chris@42 161 T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 162 T3 = BYTWJ(&(W[0]), T2);
Chris@42 163 T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 164 T5 = BYTWJ(&(W[TWVL * 10]), T4);
Chris@42 165 T6 = VADD(T3, T5);
Chris@42 166 Ti = VSUB(T5, T3);
Chris@42 167 }
/* Pair (2,5): load, apply BYTWJ, keep sum Tb and difference Tk. */
Chris@42 168 {
Chris@42 169 V T8, Ta, T7, T9;
Chris@42 170 T7 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 171 T8 = BYTWJ(&(W[TWVL * 2]), T7);
Chris@42 172 T9 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 173 Ta = BYTWJ(&(W[TWVL * 8]), T9);
Chris@42 174 Tb = VADD(T8, Ta);
Chris@42 175 Tk = VSUB(Ta, T8);
Chris@42 176 }
/* Output 0 is the sum of input 0 and the three pair sums. */
Chris@42 177 ST(&(x[0]), VADD(T1, VADD(T6, VADD(Tb, Tg))), ms, &(x[0]));
/* Outputs 3 and 4: To is built from T1 and the pair sums, Tp from the
 * pair differences (wrapped in VBYI); the two outputs are To + Tp and
 * To - Tp. */
Chris@42 178 Tp = VBYI(VFMA(LDK(KP433883739), Ti, VFNMS(LDK(KP781831482), Tk, VMUL(LDK(KP974927912), Tj))));
Chris@42 179 To = VFMA(LDK(KP623489801), Tb, VFNMS(LDK(KP222520933), Tg, VFNMS(LDK(KP900968867), T6, T1)));
Chris@42 180 ST(&(x[WS(rs, 4)]), VSUB(To, Tp), ms, &(x[0]));
Chris@42 181 ST(&(x[WS(rs, 3)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
Chris@42 182 {
Chris@42 183 V Tl, Th, Tn, Tm;
/* Outputs 2 and 5 (Th +/- Tl), same construction as above. */
Chris@42 184 Tl = VBYI(VFNMS(LDK(KP781831482), Tj, VFNMS(LDK(KP433883739), Tk, VMUL(LDK(KP974927912), Ti))));
Chris@42 185 Th = VFMA(LDK(KP623489801), Tg, VFNMS(LDK(KP900968867), Tb, VFNMS(LDK(KP222520933), T6, T1)));
Chris@42 186 ST(&(x[WS(rs, 5)]), VSUB(Th, Tl), ms, &(x[WS(rs, 1)]));
Chris@42 187 ST(&(x[WS(rs, 2)]), VADD(Th, Tl), ms, &(x[0]));
/* Outputs 1 and 6 (Tm +/- Tn), same construction as above. */
Chris@42 188 Tn = VBYI(VFMA(LDK(KP781831482), Ti, VFMA(LDK(KP974927912), Tk, VMUL(LDK(KP433883739), Tj))));
Chris@42 189 Tm = VFMA(LDK(KP623489801), T6, VFNMS(LDK(KP900968867), Tg, VFNMS(LDK(KP222520933), Tb, T1)));
Chris@42 190 ST(&(x[WS(rs, 6)]), VSUB(Tm, Tn), ms, &(x[0]));
Chris@42 191 ST(&(x[WS(rs, 1)]), VADD(Tm, Tn), ms, &(x[WS(rs, 1)]));
Chris@42 192 }
Chris@42 193 }
Chris@42 194 }
Chris@42 195 VLEAVE();
Chris@42 196 }
Chris@42 197
/*
 * Twiddle instruction table referenced by desc: six VTW entries whose
 * second argument runs 1..6, followed by a {TW_NEXT, VL, 0} entry. The
 * six VTW entries line up with the six BYTWJ multiplications performed
 * in t1fuv_7.
 * NOTE(review): VTW and TW_NEXT are defined in headers not shown here;
 * confirm their exact meaning there.
 */
Chris@42 198 static const tw_instr twinstr[] = {
Chris@42 199 VTW(0, 1),
Chris@42 200 VTW(0, 2),
Chris@42 201 VTW(0, 3),
Chris@42 202 VTW(0, 4),
Chris@42 203 VTW(0, 5),
Chris@42 204 VTW(0, 6),
Chris@42 205 {TW_NEXT, VL, 0}
Chris@42 206 };
Chris@42 207
/*
 * Codelet descriptor passed at registration. Fields, in order: 7 (the
 * generator was invoked with -n 7), the codelet name string, the twinstr
 * table, &GENUS, and {24, 18, 12, 0}, which matches the "24 additions,
 * 18 multiplications, 12 fused multiply/add" counts in this variant's
 * generated header comment. The three trailing zeros are left as
 * generated. NOTE(review): field names live in the ct_desc declaration,
 * which is not shown here.
 */
Chris@42 208 static const ct_desc desc = { 7, XSIMD_STRING("t1fuv_7"), twinstr, &GENUS, {24, 18, 12, 0}, 0, 0, 0 };
Chris@42 209
/*
 * Public entry point of this file (build without HAVE_FMA): hands t1fuv_7
 * and its descriptor to planner p through X(kdft_dit_register).
 * NOTE(review): XSIMD() and X() are name-wrapping macros defined
 * elsewhere; the final symbol name depends on them.
 */
Chris@42 210 void XSIMD(codelet_t1fuv_7) (planner *p) {
Chris@42 211 X(kdft_dit_register) (p, t1fuv_7, &desc);
Chris@42 212 }
Chris@42 213 #endif /* HAVE_FMA */