annotate src/fftw-3.3.8/dft/simd/common/t1sv_4.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:09 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1sv_4 -include dft/simd/ts.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 22 FP additions, 12 FP multiplications,
Chris@82 32 * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
Chris@82 33 * 15 stack variables, 0 constants, and 16 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/ts.h"
Chris@82 36
/*
 * t1sv_4 (FMA-preferring variant, selected by the #if above): size-4
 * twiddle codelet operating in place on the split arrays ri and ii.
 *
 *   ri, ii  arrays that are both read and overwritten; each iteration
 *           touches elements 0 and WS(rs, 1..3) of each array.
 *           NOTE(review): presumably real / imaginary parts -- confirm.
 *   W       twiddle table; offset by mb * 6 on entry, then advanced by
 *           (2 * VL) * 6 per iteration. Six LDW loads per iteration.
 *   rs      stride handed to WS() to locate the four inputs/outputs.
 *   mb, me  the loop runs from m = mb while m < me, in steps of 2 * VL.
 *   ms      ri and ii advance by (2 * VL) * ms per iteration; ms is also
 *           passed to every LD/ST.
 *
 * NOTE(review): the comments below assume VFMA(a, b, c) = a*b + c,
 * VFNMS(a, b, c) = c - a*b, and VADD/VSUB/VMUL are element-wise; these
 * macros are defined outside this file -- confirm.
 */
Chris@82 37 static void t1sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 {
Chris@82 40 INT m;
Chris@82 41 for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(8, rs)) {
Chris@82 42 V T1, Tv, T7, Tu, Te, To, Tk, Tq;
/* Input 0 is used as loaded; no twiddle is applied to it. */
Chris@82 43 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@82 44 Tv = LD(&(ii[0]), ms, &(ii[0]));
Chris@82 45 {
/* Input 2 combined with twiddle pair W[TWVL * 2], W[TWVL * 3] -> (T7, Tu). */
Chris@82 46 V T3, T6, T4, Tt, T2, T5;
Chris@82 47 T3 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@82 48 T6 = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@82 49 T2 = LDW(&(W[TWVL * 2]));
Chris@82 50 T4 = VMUL(T2, T3);
Chris@82 51 Tt = VMUL(T2, T6);
Chris@82 52 T5 = LDW(&(W[TWVL * 3]));
Chris@82 53 T7 = VFMA(T5, T6, T4);
Chris@82 54 Tu = VFNMS(T5, T3, Tt);
Chris@82 55 }
Chris@82 56 {
/* Input 1 combined with twiddle pair W[0], W[TWVL * 1] -> (Te, To). */
Chris@82 57 V Ta, Td, Tb, Tn, T9, Tc;
Chris@82 58 Ta = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@82 59 Td = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@82 60 T9 = LDW(&(W[0]));
Chris@82 61 Tb = VMUL(T9, Ta);
Chris@82 62 Tn = VMUL(T9, Td);
Chris@82 63 Tc = LDW(&(W[TWVL * 1]));
Chris@82 64 Te = VFMA(Tc, Td, Tb);
Chris@82 65 To = VFNMS(Tc, Ta, Tn);
Chris@82 66 }
Chris@82 67 {
/* Input 3 combined with twiddle pair W[TWVL * 4], W[TWVL * 5] -> (Tk, Tq). */
Chris@82 68 V Tg, Tj, Th, Tp, Tf, Ti;
Chris@82 69 Tg = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@82 70 Tj = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@82 71 Tf = LDW(&(W[TWVL * 4]));
Chris@82 72 Th = VMUL(Tf, Tg);
Chris@82 73 Tp = VMUL(Tf, Tj);
Chris@82 74 Ti = LDW(&(W[TWVL * 5]));
Chris@82 75 Tk = VFMA(Ti, Tj, Th);
Chris@82 76 Tq = VFNMS(Ti, Tg, Tp);
Chris@82 77 }
Chris@82 78 {
/* Outputs 0 and 2: (in0 + in2) plus / minus (in1 + in3), for ri and ii. */
Chris@82 79 V T8, Tl, Ts, Tw;
Chris@82 80 T8 = VADD(T1, T7);
Chris@82 81 Tl = VADD(Te, Tk);
Chris@82 82 ST(&(ri[WS(rs, 2)]), VSUB(T8, Tl), ms, &(ri[0]));
Chris@82 83 ST(&(ri[0]), VADD(T8, Tl), ms, &(ri[0]));
Chris@82 84 Ts = VADD(To, Tq);
Chris@82 85 Tw = VADD(Tu, Tv);
Chris@82 86 ST(&(ii[0]), VADD(Ts, Tw), ms, &(ii[0]));
Chris@82 87 ST(&(ii[WS(rs, 2)]), VSUB(Tw, Ts), ms, &(ii[0]));
Chris@82 88 }
Chris@82 89 {
/*
 * Outputs 1 and 3: (in0 - in2) of one array is combined with (in1 - in3)
 * of the other array, i.e. the ri results use the ii-side difference Tr
 * and the ii results use the ri-side difference Ty.
 */
Chris@82 90 V Tm, Tr, Tx, Ty;
Chris@82 91 Tm = VSUB(T1, T7);
Chris@82 92 Tr = VSUB(To, Tq);
Chris@82 93 ST(&(ri[WS(rs, 3)]), VSUB(Tm, Tr), ms, &(ri[WS(rs, 1)]));
Chris@82 94 ST(&(ri[WS(rs, 1)]), VADD(Tm, Tr), ms, &(ri[WS(rs, 1)]));
Chris@82 95 Tx = VSUB(Tv, Tu);
Chris@82 96 Ty = VSUB(Te, Tk);
Chris@82 97 ST(&(ii[WS(rs, 1)]), VSUB(Tx, Ty), ms, &(ii[WS(rs, 1)]));
Chris@82 98 ST(&(ii[WS(rs, 3)]), VADD(Ty, Tx), ms, &(ii[WS(rs, 1)]));
Chris@82 99 }
Chris@82 100 }
Chris@82 101 }
Chris@82 102 VLEAVE();
Chris@82 103 }
Chris@82 104
/*
 * Twiddle instruction table referenced by desc: three VTW entries with
 * second argument 1, 2 and 3, terminated by a TW_NEXT entry carrying
 * 2 * VL (the same step t1sv_4 uses for m).
 * NOTE(review): presumably one VTW entry per twiddled input 1..3 --
 * confirm against the VTW / TW_NEXT definitions, which are not in this file.
 */
Chris@82 105 static const tw_instr twinstr[] = {
Chris@82 106 VTW(0, 1),
Chris@82 107 VTW(0, 2),
Chris@82 108 VTW(0, 3),
Chris@82 109 {TW_NEXT, (2 * VL), 0}
Chris@82 110 };
Chris@82 111
/*
 * Codelet descriptor handed to the planner at registration: 4, the name
 * string "t1sv_4", the twinstr table, &GENUS, and {16, 6, 6, 0}, which
 * matches the 16 additions / 6 multiplications / 6 fused multiply-adds
 * quoted in the generator comment above.
 * NOTE(review): the leading 4 is presumably the transform size (-n 4);
 * the meaning of the three trailing zeros is not visible here -- confirm
 * against the ct_desc declaration.
 */
Chris@82 112 static const ct_desc desc = { 4, XSIMD_STRING("t1sv_4"), twinstr, &GENUS, {16, 6, 6, 0}, 0, 0, 0 };
Chris@82 113
/*
 * Registration entry point: hands t1sv_4 and its descriptor to planner p
 * through X(kdft_dit_register).
 */
Chris@82 114 void XSIMD(codelet_t1sv_4) (planner *p) {
Chris@82 115 X(kdft_dit_register) (p, t1sv_4, &desc);
Chris@82 116 }
Chris@82 117 #else
Chris@82 118
Chris@82 119 /* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1sv_4 -include dft/simd/ts.h */
Chris@82 120
Chris@82 121 /*
Chris@82 122 * This function contains 22 FP additions, 12 FP multiplications,
Chris@82 123 * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
Chris@82 124 * 13 stack variables, 0 constants, and 16 memory accesses
Chris@82 125 */
Chris@82 126 #include "dft/simd/ts.h"
Chris@82 127
/*
 * t1sv_4 (variant used when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): size-4 twiddle codelet operating
 * in place on the split arrays ri and ii.
 *
 *   ri, ii  arrays that are both read and overwritten; each iteration
 *           touches elements 0 and WS(rs, 1..3) of each array.
 *           NOTE(review): presumably real / imaginary parts -- confirm.
 *   W       twiddle table; offset by mb * 6 on entry, then advanced by
 *           (2 * VL) * 6 per iteration. Six LDW loads per iteration.
 *   rs      stride handed to WS() to locate the four inputs/outputs.
 *   mb, me  the loop runs from m = mb while m < me, in steps of 2 * VL.
 *   ms      ri and ii advance by (2 * VL) * ms per iteration; ms is also
 *           passed to every LD/ST.
 *
 * NOTE(review): the comments below assume VFMA(a, b, c) = a*b + c,
 * VFNMS(a, b, c) = c - a*b, and VADD/VSUB/VMUL are element-wise; these
 * macros are defined outside this file -- confirm.
 */
Chris@82 128 static void t1sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 129 {
Chris@82 130 {
Chris@82 131 INT m;
Chris@82 132 for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(8, rs)) {
Chris@82 133 V T1, Tp, T6, To, Tc, Tk, Th, Tl;
/* Input 0 is used as loaded; no twiddle is applied to it. */
Chris@82 134 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@82 135 Tp = LD(&(ii[0]), ms, &(ii[0]));
Chris@82 136 {
/* Input 2 combined with twiddle pair W[TWVL * 2], W[TWVL * 3] -> (T6, To). */
Chris@82 137 V T3, T5, T2, T4;
Chris@82 138 T3 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@82 139 T5 = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@82 140 T2 = LDW(&(W[TWVL * 2]));
Chris@82 141 T4 = LDW(&(W[TWVL * 3]));
Chris@82 142 T6 = VFMA(T2, T3, VMUL(T4, T5));
Chris@82 143 To = VFNMS(T4, T3, VMUL(T2, T5));
Chris@82 144 }
Chris@82 145 {
/* Input 1 combined with twiddle pair W[0], W[TWVL * 1] -> (Tc, Tk). */
Chris@82 146 V T9, Tb, T8, Ta;
Chris@82 147 T9 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@82 148 Tb = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@82 149 T8 = LDW(&(W[0]));
Chris@82 150 Ta = LDW(&(W[TWVL * 1]));
Chris@82 151 Tc = VFMA(T8, T9, VMUL(Ta, Tb));
Chris@82 152 Tk = VFNMS(Ta, T9, VMUL(T8, Tb));
Chris@82 153 }
Chris@82 154 {
/* Input 3 combined with twiddle pair W[TWVL * 4], W[TWVL * 5] -> (Th, Tl). */
Chris@82 155 V Te, Tg, Td, Tf;
Chris@82 156 Te = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@82 157 Tg = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@82 158 Td = LDW(&(W[TWVL * 4]));
Chris@82 159 Tf = LDW(&(W[TWVL * 5]));
Chris@82 160 Th = VFMA(Td, Te, VMUL(Tf, Tg));
Chris@82 161 Tl = VFNMS(Tf, Te, VMUL(Td, Tg));
Chris@82 162 }
Chris@82 163 {
/* Outputs 0 and 2: (in0 + in2) plus / minus (in1 + in3), for ri and ii. */
Chris@82 164 V T7, Ti, Tn, Tq;
Chris@82 165 T7 = VADD(T1, T6);
Chris@82 166 Ti = VADD(Tc, Th);
Chris@82 167 ST(&(ri[WS(rs, 2)]), VSUB(T7, Ti), ms, &(ri[0]));
Chris@82 168 ST(&(ri[0]), VADD(T7, Ti), ms, &(ri[0]));
Chris@82 169 Tn = VADD(Tk, Tl);
Chris@82 170 Tq = VADD(To, Tp);
Chris@82 171 ST(&(ii[0]), VADD(Tn, Tq), ms, &(ii[0]));
Chris@82 172 ST(&(ii[WS(rs, 2)]), VSUB(Tq, Tn), ms, &(ii[0]));
Chris@82 173 }
Chris@82 174 {
/*
 * Outputs 1 and 3: (in0 - in2) of one array is combined with (in1 - in3)
 * of the other array, i.e. the ri results use the ii-side difference Tm
 * and the ii results use the ri-side difference Ts.
 */
Chris@82 175 V Tj, Tm, Tr, Ts;
Chris@82 176 Tj = VSUB(T1, T6);
Chris@82 177 Tm = VSUB(Tk, Tl);
Chris@82 178 ST(&(ri[WS(rs, 3)]), VSUB(Tj, Tm), ms, &(ri[WS(rs, 1)]));
Chris@82 179 ST(&(ri[WS(rs, 1)]), VADD(Tj, Tm), ms, &(ri[WS(rs, 1)]));
Chris@82 180 Tr = VSUB(Tp, To);
Chris@82 181 Ts = VSUB(Tc, Th);
Chris@82 182 ST(&(ii[WS(rs, 1)]), VSUB(Tr, Ts), ms, &(ii[WS(rs, 1)]));
Chris@82 183 ST(&(ii[WS(rs, 3)]), VADD(Ts, Tr), ms, &(ii[WS(rs, 1)]));
Chris@82 184 }
Chris@82 185 }
Chris@82 186 }
Chris@82 187 VLEAVE();
Chris@82 188 }
Chris@82 189
/*
 * Twiddle instruction table referenced by desc: three VTW entries with
 * second argument 1, 2 and 3, terminated by a TW_NEXT entry carrying
 * 2 * VL (the same step t1sv_4 uses for m).
 * NOTE(review): presumably one VTW entry per twiddled input 1..3 --
 * confirm against the VTW / TW_NEXT definitions, which are not in this file.
 */
Chris@82 190 static const tw_instr twinstr[] = {
Chris@82 191 VTW(0, 1),
Chris@82 192 VTW(0, 2),
Chris@82 193 VTW(0, 3),
Chris@82 194 {TW_NEXT, (2 * VL), 0}
Chris@82 195 };
Chris@82 196
/*
 * Codelet descriptor handed to the planner at registration: 4, the name
 * string "t1sv_4", the twinstr table, &GENUS, and {16, 6, 6, 0}, which
 * matches the 16 additions / 6 multiplications / 6 fused multiply-adds
 * quoted in the generator comment above.
 * NOTE(review): the leading 4 is presumably the transform size (-n 4);
 * the meaning of the three trailing zeros is not visible here -- confirm
 * against the ct_desc declaration.
 */
Chris@82 197 static const ct_desc desc = { 4, XSIMD_STRING("t1sv_4"), twinstr, &GENUS, {16, 6, 6, 0}, 0, 0, 0 };
Chris@82 198
/*
 * Registration entry point: hands t1sv_4 and its descriptor to planner p
 * through X(kdft_dit_register).
 */
Chris@82 199 void XSIMD(codelet_t1sv_4) (planner *p) {
Chris@82 200 X(kdft_dit_register) (p, t1sv_4, &desc);
Chris@82 201 }
Chris@82 202 #endif