annotate src/fftw-3.3.8/dft/simd/common/t2sv_4.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:11 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 4 -name t2sv_4 -include dft/simd/ts.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 24 FP additions, 16 FP multiplications,
Chris@82 32 * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
Chris@82 33 * 21 stack variables, 0 constants, and 16 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/ts.h"
Chris@82 36
Chris@82 37 static void t2sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) /* FMA-branch size-4 twiddle codelet: for m from mb up to me in steps of 2*VL, reads 4 elements from ri/ii, scales three of them by twiddle values from W, and stores the combined results back to the same locations */
Chris@82 38 { /* ri, ii: read and written at offsets 0 and WS(rs, 1..3); presumably the real and imaginary parts -- confirm against callers */
Chris@82 39 { /* W: read-only twiddle table, four vector loads per iteration; ms: per-m stride used to advance ri and ii */
Chris@82 40 INT m;
Chris@82 41 for (m = mb, W = W + (mb * 4); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 4), MAKE_VOLATILE_STRIDE(8, rs)) { /* m only counts iterations; ri, ii and W are advanced in lock-step. NOTE(review): MAKE_VOLATILE_STRIDE is defined elsewhere -- confirm its purpose */
Chris@82 42 V T2, T6, T3, T5, T7, Tb, T4, Ta;
Chris@82 43 T2 = LDW(&(W[0])); /* twiddle loads: (T2, T5) come from W[0] and W[TWVL * 1], (T3, T6) from W[TWVL * 2] and W[TWVL * 3] */
Chris@82 44 T6 = LDW(&(W[TWVL * 3]));
Chris@82 45 T3 = LDW(&(W[TWVL * 2]));
Chris@82 46 T4 = VMUL(T2, T3);
Chris@82 47 Ta = VMUL(T2, T6);
Chris@82 48 T5 = LDW(&(W[TWVL * 1]));
Chris@82 49 T7 = VFMA(T5, T6, T4); /* T7 = T2*T3 + T5*T6, assuming VFMA(a, b, c) computes a*b + c -- confirm in the SIMD header */
Chris@82 50 Tb = VFNMS(T5, T3, Ta); /* Tb = T2*T6 - T5*T3, assuming VFNMS(a, b, c) computes c - a*b -- confirm; (T7, Tb) is a third multiplier pair derived from the two loaded pairs */
Chris@82 51 {
Chris@82 52 V T1, Tx, Td, Tw, Ti, Tq, Tm, Ts;
Chris@82 53 T1 = LD(&(ri[0]), ms, &(ri[0])); /* element 0 is used as loaded; no twiddle multiplier is applied to it */
Chris@82 54 Tx = LD(&(ii[0]), ms, &(ii[0]));
Chris@82 55 {
Chris@82 56 V T8, T9, Tc, Tv;
Chris@82 57 T8 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0])); /* element 2, scaled by the derived pair (T7, Tb) */
Chris@82 58 T9 = VMUL(T7, T8);
Chris@82 59 Tc = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@82 60 Tv = VMUL(T7, Tc);
Chris@82 61 Td = VFMA(Tb, Tc, T9); /* Td = T7*T8 + Tb*Tc (same VFMA assumption) */
Chris@82 62 Tw = VFNMS(Tb, T8, Tv); /* Tw = T7*Tc - Tb*T8 (same VFNMS assumption) */
Chris@82 63 }
Chris@82 64 {
Chris@82 65 V Tf, Tg, Th, Tp;
Chris@82 66 Tf = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)])); /* element 1, scaled by the loaded pair (T2, T5) */
Chris@82 67 Tg = VMUL(T2, Tf);
Chris@82 68 Th = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@82 69 Tp = VMUL(T2, Th);
Chris@82 70 Ti = VFMA(T5, Th, Tg); /* Ti = T2*Tf + T5*Th */
Chris@82 71 Tq = VFNMS(T5, Tf, Tp); /* Tq = T2*Th - T5*Tf */
Chris@82 72 }
Chris@82 73 {
Chris@82 74 V Tj, Tk, Tl, Tr;
Chris@82 75 Tj = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)])); /* element 3, scaled by the loaded pair (T3, T6) */
Chris@82 76 Tk = VMUL(T3, Tj);
Chris@82 77 Tl = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@82 78 Tr = VMUL(T3, Tl);
Chris@82 79 Tm = VFMA(T6, Tl, Tk); /* Tm = T3*Tj + T6*Tl */
Chris@82 80 Ts = VFNMS(T6, Tj, Tr); /* Ts = T3*Tl - T6*Tj */
Chris@82 81 }
Chris@82 82 { /* outputs 0 and 2: sum and difference of (element 0 + scaled element 2) and (scaled element 1 + scaled element 3) */
Chris@82 83 V Te, Tn, Tu, Ty;
Chris@82 84 Te = VADD(T1, Td);
Chris@82 85 Tn = VADD(Ti, Tm);
Chris@82 86 ST(&(ri[WS(rs, 2)]), VSUB(Te, Tn), ms, &(ri[0]));
Chris@82 87 ST(&(ri[0]), VADD(Te, Tn), ms, &(ri[0]));
Chris@82 88 Tu = VADD(Tq, Ts);
Chris@82 89 Ty = VADD(Tw, Tx);
Chris@82 90 ST(&(ii[0]), VADD(Tu, Ty), ms, &(ii[0]));
Chris@82 91 ST(&(ii[WS(rs, 2)]), VSUB(Ty, Tu), ms, &(ii[0]));
Chris@82 92 }
Chris@82 93 { /* outputs 1 and 3: built from the differences (element 0 - scaled element 2) and (scaled element 1 - scaled element 3), with the ri results using the ii-side difference and vice versa */
Chris@82 94 V To, Tt, Tz, TA;
Chris@82 95 To = VSUB(T1, Td);
Chris@82 96 Tt = VSUB(Tq, Ts);
Chris@82 97 ST(&(ri[WS(rs, 3)]), VSUB(To, Tt), ms, &(ri[WS(rs, 1)]));
Chris@82 98 ST(&(ri[WS(rs, 1)]), VADD(To, Tt), ms, &(ri[WS(rs, 1)]));
Chris@82 99 Tz = VSUB(Tx, Tw);
Chris@82 100 TA = VSUB(Ti, Tm);
Chris@82 101 ST(&(ii[WS(rs, 1)]), VSUB(Tz, TA), ms, &(ii[WS(rs, 1)]));
Chris@82 102 ST(&(ii[WS(rs, 3)]), VADD(TA, Tz), ms, &(ii[WS(rs, 1)]));
Chris@82 103 }
Chris@82 104 }
Chris@82 105 }
Chris@82 106 }
Chris@82 107 VLEAVE(); /* NOTE(review): VLEAVE is defined elsewhere; presumably a SIMD epilogue -- confirm */
Chris@82 108 }
Chris@82 109
Chris@82 110 static const tw_instr twinstr[] = { /* twiddle instruction table referenced by desc (FMA branch) */
Chris@82 111 VTW(0, 1), /* NOTE(review): VTW is defined elsewhere; presumably requests the twiddle pair for index 1 -- confirm */
Chris@82 112 VTW(0, 3), /* presumably the twiddle pair for index 3; the codelet derives its third multiplier pair from these two */
Chris@82 113 {TW_NEXT, (2 * VL), 0} /* final entry; its (2 * VL) matches the step by which the codelet loop advances m */
Chris@82 114 };
Chris@82 115
Chris@82 116 static const ct_desc desc = { 4, XSIMD_STRING("t2sv_4"), twinstr, &GENUS, {16, 8, 8, 0}, 0, 0, 0 }; /* codelet descriptor: size 4, codelet name, twiddle table, genus; {16, 8, 8, 0} matches the add/mul/fma counts in the generator comment (last field presumably "other" -- confirm); meaning of the trailing zeros is not visible here */
Chris@82 117
Chris@82 118 void XSIMD(codelet_t2sv_4) (planner *p) { /* registration entry point (FMA branch): hands the t2sv_4 kernel and its descriptor to planner p */
Chris@82 119 X(kdft_dit_register) (p, t2sv_4, &desc); /* NOTE(review): kdft_dit_register is defined elsewhere; the name suggests decimation-in-time registration -- confirm */
Chris@82 120 }
Chris@82 121 #else
Chris@82 122
Chris@82 123 /* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 4 -name t2sv_4 -include dft/simd/ts.h */
Chris@82 124
Chris@82 125 /*
Chris@82 126 * This function contains 24 FP additions, 16 FP multiplications,
Chris@82 127 * (or, 16 additions, 8 multiplications, 8 fused multiply/add),
Chris@82 128 * 21 stack variables, 0 constants, and 16 memory accesses
Chris@82 129 */
Chris@82 130 #include "dft/simd/ts.h"
Chris@82 131
Chris@82 132 static void t2sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) /* non-FMA-branch size-4 twiddle codelet: for m from mb up to me in steps of 2*VL, reads 4 elements from ri/ii, scales three of them by twiddle values from W, and stores the combined results back to the same locations */
Chris@82 133 { /* ri, ii: read and written at offsets 0 and WS(rs, 1..3); presumably the real and imaginary parts -- confirm against callers */
Chris@82 134 { /* W: read-only twiddle table, four vector loads per iteration; ms: per-m stride used to advance ri and ii */
Chris@82 135 INT m;
Chris@82 136 for (m = mb, W = W + (mb * 4); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 4), MAKE_VOLATILE_STRIDE(8, rs)) { /* m only counts iterations; ri, ii and W are advanced in lock-step. NOTE(review): MAKE_VOLATILE_STRIDE is defined elsewhere -- confirm its purpose */
Chris@82 137 V T2, T4, T3, T5, T6, T8;
Chris@82 138 T2 = LDW(&(W[0])); /* twiddle loads: (T2, T4) come from W[0] and W[TWVL * 1], (T3, T5) from W[TWVL * 2] and W[TWVL * 3] */
Chris@82 139 T4 = LDW(&(W[TWVL * 1]));
Chris@82 140 T3 = LDW(&(W[TWVL * 2]));
Chris@82 141 T5 = LDW(&(W[TWVL * 3]));
Chris@82 142 T6 = VFMA(T2, T3, VMUL(T4, T5)); /* T6 = T2*T3 + T4*T5, assuming VFMA(a, b, c) computes a*b + c -- confirm in the SIMD header */
Chris@82 143 T8 = VFNMS(T4, T3, VMUL(T2, T5)); /* T8 = T2*T5 - T4*T3, assuming VFNMS(a, b, c) computes c - a*b -- confirm; (T6, T8) is a third multiplier pair derived from the two loaded pairs */
Chris@82 144 {
Chris@82 145 V T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9;
Chris@82 146 T1 = LD(&(ri[0]), ms, &(ri[0])); /* element 0 is used as loaded; no twiddle multiplier is applied to it */
Chris@82 147 Tp = LD(&(ii[0]), ms, &(ii[0]));
Chris@82 148 T7 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0])); /* element 2, scaled by the derived pair (T6, T8) */
Chris@82 149 T9 = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@82 150 Ta = VFMA(T6, T7, VMUL(T8, T9)); /* Ta = T6*T7 + T8*T9 (same VFMA assumption) */
Chris@82 151 To = VFNMS(T8, T7, VMUL(T6, T9)); /* To = T6*T9 - T8*T7 (same VFNMS assumption) */
Chris@82 152 {
Chris@82 153 V Tc, Td, Tf, Tg;
Chris@82 154 Tc = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)])); /* element 1, scaled by the loaded pair (T2, T4) */
Chris@82 155 Td = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@82 156 Te = VFMA(T2, Tc, VMUL(T4, Td)); /* Te = T2*Tc + T4*Td */
Chris@82 157 Tk = VFNMS(T4, Tc, VMUL(T2, Td)); /* Tk = T2*Td - T4*Tc */
Chris@82 158 Tf = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)])); /* element 3, scaled by the loaded pair (T3, T5) */
Chris@82 159 Tg = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@82 160 Th = VFMA(T3, Tf, VMUL(T5, Tg)); /* Th = T3*Tf + T5*Tg */
Chris@82 161 Tl = VFNMS(T5, Tf, VMUL(T3, Tg)); /* Tl = T3*Tg - T5*Tf */
Chris@82 162 }
Chris@82 163 { /* outputs 0 and 2: sum and difference of (element 0 + scaled element 2) and (scaled element 1 + scaled element 3) */
Chris@82 164 V Tb, Ti, Tn, Tq;
Chris@82 165 Tb = VADD(T1, Ta);
Chris@82 166 Ti = VADD(Te, Th);
Chris@82 167 ST(&(ri[WS(rs, 2)]), VSUB(Tb, Ti), ms, &(ri[0]));
Chris@82 168 ST(&(ri[0]), VADD(Tb, Ti), ms, &(ri[0]));
Chris@82 169 Tn = VADD(Tk, Tl);
Chris@82 170 Tq = VADD(To, Tp);
Chris@82 171 ST(&(ii[0]), VADD(Tn, Tq), ms, &(ii[0]));
Chris@82 172 ST(&(ii[WS(rs, 2)]), VSUB(Tq, Tn), ms, &(ii[0]));
Chris@82 173 }
Chris@82 174 { /* outputs 1 and 3: built from the differences (element 0 - scaled element 2) and (scaled element 1 - scaled element 3), with the ri results using the ii-side difference and vice versa */
Chris@82 175 V Tj, Tm, Tr, Ts;
Chris@82 176 Tj = VSUB(T1, Ta);
Chris@82 177 Tm = VSUB(Tk, Tl);
Chris@82 178 ST(&(ri[WS(rs, 3)]), VSUB(Tj, Tm), ms, &(ri[WS(rs, 1)]));
Chris@82 179 ST(&(ri[WS(rs, 1)]), VADD(Tj, Tm), ms, &(ri[WS(rs, 1)]));
Chris@82 180 Tr = VSUB(Tp, To);
Chris@82 181 Ts = VSUB(Te, Th);
Chris@82 182 ST(&(ii[WS(rs, 1)]), VSUB(Tr, Ts), ms, &(ii[WS(rs, 1)]));
Chris@82 183 ST(&(ii[WS(rs, 3)]), VADD(Ts, Tr), ms, &(ii[WS(rs, 1)]));
Chris@82 184 }
Chris@82 185 }
Chris@82 186 }
Chris@82 187 }
Chris@82 188 VLEAVE(); /* NOTE(review): VLEAVE is defined elsewhere; presumably a SIMD epilogue -- confirm */
Chris@82 189 }
Chris@82 190
Chris@82 191 static const tw_instr twinstr[] = { /* twiddle instruction table referenced by desc (non-FMA branch) */
Chris@82 192 VTW(0, 1), /* NOTE(review): VTW is defined elsewhere; presumably requests the twiddle pair for index 1 -- confirm */
Chris@82 193 VTW(0, 3), /* presumably the twiddle pair for index 3; the codelet derives its third multiplier pair from these two */
Chris@82 194 {TW_NEXT, (2 * VL), 0} /* final entry; its (2 * VL) matches the step by which the codelet loop advances m */
Chris@82 195 };
Chris@82 196
Chris@82 197 static const ct_desc desc = { 4, XSIMD_STRING("t2sv_4"), twinstr, &GENUS, {16, 8, 8, 0}, 0, 0, 0 }; /* codelet descriptor: size 4, codelet name, twiddle table, genus; {16, 8, 8, 0} matches the add/mul/fma counts in the generator comment (last field presumably "other" -- confirm); meaning of the trailing zeros is not visible here */
Chris@82 198
Chris@82 199 void XSIMD(codelet_t2sv_4) (planner *p) { /* registration entry point (non-FMA branch): hands the t2sv_4 kernel and its descriptor to planner p */
Chris@82 200 X(kdft_dit_register) (p, t2sv_4, &desc); /* NOTE(review): kdft_dit_register is defined elsewhere; the name suggests decimation-in-time registration -- confirm */
Chris@82 201 }
Chris@82 202 #endif