annotate src/fftw-3.3.8/dft/simd/common/t2bv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:01 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t2bv_8 -include dft/simd/t2b.h -sign 1 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 33 FP additions, 24 FP multiplications,
Chris@82 32 * (or, 23 additions, 14 multiplications, 10 fused multiply/add),
Chris@82 33 * 24 stack variables, 1 constants, and 16 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t2b.h"
Chris@82 36
/*
 * t2bv_8 (FMA variant): size-8 twiddle codelet emitted by genfft; the
 * generator command line recorded in this file uses "-fma -n 8 -sign 1".
 *
 * Data is processed in place through x, which is initialised from ii;
 * ri is not referenced anywhere in this body.  Each loop iteration loads
 * the eight elements x[WS(rs, 0)] .. x[WS(rs, 7)], passes elements 1..7
 * through BYTW together with an entry of W (element 0 is used as loaded),
 * combines them, and stores eight results back to the same eight slots.
 *
 * Parameters:
 *   ri      not used by this function.
 *   ii      base pointer of the data that is read and overwritten.
 *   W       table consumed by BYTW; offset by mb * ((TWVL / VL) * 14)
 *           before the first iteration and by TWVL * 14 after each one.
 *   rs      stride handed to WS() to address the eight elements.
 *   mb, me  loop bounds: m starts at mb and runs while m < me, step VL.
 *   ms      x advances by VL * ms per iteration; also passed to LD/ST.
 *
 * NOTE(review): LD, ST, BYTW, VFMA, VFNMS, VFMAI, VFNMSI, LDK, DVK, WS,
 * MAKE_VOLATILE_STRIDE and VLEAVE are macros defined outside this file;
 * the descriptions of them below are assumptions to confirm there.
 */
Chris@82 37 static void t2bv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* The only constant used: 0.7071067811865... is 1/sqrt(2). */
Chris@82 39 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 40 {
Chris@82 41 INT m;
Chris@82 42 R *x;
Chris@82 43 x = ii;
Chris@82 44 for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
Chris@82 45 V T4, Tq, Tl, Tr, T9, Tt, Te, Tu, T1, T3, T2;
/* Elements 0 and 4: T4 is their difference, Tq their sum (element 4 after BYTW). */
Chris@82 46 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 47 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 48 T3 = BYTW(&(W[TWVL * 6]), T2);
Chris@82 49 T4 = VSUB(T1, T3);
Chris@82 50 Tq = VADD(T1, T3);
Chris@82 51 {
/* Elements 2 and 6 (both after BYTW): Tl = difference, Tr = sum. */
Chris@82 52 V Ti, Tk, Th, Tj;
Chris@82 53 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 54 Ti = BYTW(&(W[TWVL * 2]), Th);
Chris@82 55 Tj = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 56 Tk = BYTW(&(W[TWVL * 10]), Tj);
Chris@82 57 Tl = VSUB(Ti, Tk);
Chris@82 58 Tr = VADD(Ti, Tk);
Chris@82 59 }
Chris@82 60 {
/* Elements 1 and 5 (both after BYTW): T9 = difference, Tt = sum. */
Chris@82 61 V T6, T8, T5, T7;
Chris@82 62 T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 63 T6 = BYTW(&(W[0]), T5);
Chris@82 64 T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 65 T8 = BYTW(&(W[TWVL * 8]), T7);
Chris@82 66 T9 = VSUB(T6, T8);
Chris@82 67 Tt = VADD(T6, T8);
Chris@82 68 }
Chris@82 69 {
/* Elements 7 and 3 (both after BYTW), 7 taken first: Te = difference, Tu = sum. */
Chris@82 70 V Tb, Td, Ta, Tc;
Chris@82 71 Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 72 Tb = BYTW(&(W[TWVL * 12]), Ta);
Chris@82 73 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 74 Td = BYTW(&(W[TWVL * 4]), Tc);
Chris@82 75 Te = VSUB(Tb, Td);
Chris@82 76 Tu = VADD(Tb, Td);
Chris@82 77 }
Chris@82 78 {
/* Slots 0, 2, 4 and 6 are built from the four sums Tq, Tr, Tt, Tu. */
Chris@82 79 V Ts, Tv, Tw, Tx;
Chris@82 80 Ts = VSUB(Tq, Tr);
Chris@82 81 Tv = VSUB(Tt, Tu);
/* NOTE(review): VFNMSI/VFMAI presumably give Ts -/+ i*Tv, matching the
 * VBYI + VSUB/VADD form in the non-FMA variant of this file -- confirm. */
Chris@82 82 ST(&(x[WS(rs, 6)]), VFNMSI(Tv, Ts), ms, &(x[0]));
Chris@82 83 ST(&(x[WS(rs, 2)]), VFMAI(Tv, Ts), ms, &(x[0]));
Chris@82 84 Tw = VADD(Tq, Tr);
Chris@82 85 Tx = VADD(Tt, Tu);
Chris@82 86 ST(&(x[WS(rs, 4)]), VSUB(Tw, Tx), ms, &(x[0]));
Chris@82 87 ST(&(x[0]), VADD(Tw, Tx), ms, &(x[0]));
Chris@82 88 {
/* Slots 1, 3, 5 and 7 are built from the four differences T4, Tl, T9, Te,
 * with (T9 + Te) and (T9 - Te) scaled by KP707106781 inside the
 * VFMA/VFNMS calls (presumably c -/+ a*b; confirm the macro definitions). */
Chris@82 89 V Tg, To, Tn, Tp, Tf, Tm;
Chris@82 90 Tf = VADD(T9, Te);
Chris@82 91 Tg = VFNMS(LDK(KP707106781), Tf, T4);
Chris@82 92 To = VFMA(LDK(KP707106781), Tf, T4);
Chris@82 93 Tm = VSUB(T9, Te);
Chris@82 94 Tn = VFNMS(LDK(KP707106781), Tm, Tl);
Chris@82 95 Tp = VFMA(LDK(KP707106781), Tm, Tl);
Chris@82 96 ST(&(x[WS(rs, 3)]), VFNMSI(Tn, Tg), ms, &(x[WS(rs, 1)]));
Chris@82 97 ST(&(x[WS(rs, 7)]), VFNMSI(Tp, To), ms, &(x[WS(rs, 1)]));
Chris@82 98 ST(&(x[WS(rs, 5)]), VFMAI(Tn, Tg), ms, &(x[WS(rs, 1)]));
Chris@82 99 ST(&(x[WS(rs, 1)]), VFMAI(Tp, To), ms, &(x[WS(rs, 1)]));
Chris@82 100 }
Chris@82 101 }
Chris@82 102 }
Chris@82 103 }
Chris@82 104 VLEAVE();
Chris@82 105 }
Chris@82 106
/*
 * Twiddle instruction table referenced by desc: seven VTW entries whose
 * second argument runs 1..7, followed by a TW_NEXT record carrying VL.
 * NOTE(review): presumably one entry per element that t2bv_8 passes
 * through BYTW (elements 1..7) -- confirm against the VTW definition.
 */
Chris@82 107 static const tw_instr twinstr[] = {
Chris@82 108 VTW(0, 1),
Chris@82 109 VTW(0, 2),
Chris@82 110 VTW(0, 3),
Chris@82 111 VTW(0, 4),
Chris@82 112 VTW(0, 5),
Chris@82 113 VTW(0, 6),
Chris@82 114 VTW(0, 7),
Chris@82 115 {TW_NEXT, VL, 0}
Chris@82 116 };
Chris@82 117
/*
 * Codelet descriptor: 8 (the -n 8 of the generator command line), the
 * codelet name string, the twinstr table and &GENUS.  {23, 14, 10, 0}
 * repeats the "23 additions, 14 multiplications, 10 fused multiply/add"
 * count from the generated comment for this variant.  The meaning of the
 * three trailing zeros is not visible here; see the ct_desc definition.
 */
Chris@82 118 static const ct_desc desc = { 8, XSIMD_STRING("t2bv_8"), twinstr, &GENUS, {23, 14, 10, 0}, 0, 0, 0 };
Chris@82 119
/*
 * Registration entry point (FMA build): hands the t2bv_8 function and its
 * descriptor to the planner p through kdft_dit_register.
 */
Chris@82 120 void XSIMD(codelet_t2bv_8) (planner *p) {
Chris@82 121 X(kdft_dit_register) (p, t2bv_8, &desc);
Chris@82 122 }
Chris@82 123 #else
Chris@82 124
Chris@82 125 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t2bv_8 -include dft/simd/t2b.h -sign 1 */
Chris@82 126
Chris@82 127 /*
Chris@82 128 * This function contains 33 FP additions, 16 FP multiplications,
Chris@82 129 * (or, 33 additions, 16 multiplications, 0 fused multiply/add),
Chris@82 130 * 24 stack variables, 1 constants, and 16 memory accesses
Chris@82 131 */
Chris@82 132 #include "dft/simd/t2b.h"
Chris@82 133
/*
 * t2bv_8 (non-FMA variant): size-8 twiddle codelet emitted by genfft; the
 * generator command line recorded in this file uses "-n 8 -sign 1"
 * without -fma, so only VADD/VSUB/VMUL/VBYI are used for arithmetic.
 *
 * Data is processed in place through x, which is initialised from ii;
 * ri is not referenced anywhere in this body.  Each loop iteration loads
 * the eight elements x[WS(rs, 0)] .. x[WS(rs, 7)], passes elements 1..7
 * through BYTW together with an entry of W (element 0 is used as loaded),
 * combines them, and stores eight results back to the same eight slots.
 *
 * Parameters:
 *   ri      not used by this function.
 *   ii      base pointer of the data that is read and overwritten.
 *   W       table consumed by BYTW; offset by mb * ((TWVL / VL) * 14)
 *           before the first iteration and by TWVL * 14 after each one.
 *   rs      stride handed to WS() to address the eight elements.
 *   mb, me  loop bounds: m starts at mb and runs while m < me, step VL.
 *   ms      x advances by VL * ms per iteration; also passed to LD/ST.
 *
 * NOTE(review): LD, ST, BYTW, VBYI, VMUL, LDK, DVK, WS,
 * MAKE_VOLATILE_STRIDE and VLEAVE are macros defined outside this file;
 * the descriptions of them below are assumptions to confirm there.
 */
Chris@82 134 static void t2bv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 135 {
/* The only constant used: 0.7071067811865... is 1/sqrt(2). */
Chris@82 136 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 137 {
Chris@82 138 INT m;
Chris@82 139 R *x;
Chris@82 140 x = ii;
Chris@82 141 for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
Chris@82 142 V Tl, Tq, Tg, Tr, T5, Tt, Ta, Tu, Ti, Tk, Tj;
/* Elements 0 and 4: Tl is their difference, Tq their sum (element 4 after BYTW). */
Chris@82 143 Ti = LD(&(x[0]), ms, &(x[0]));
Chris@82 144 Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 145 Tk = BYTW(&(W[TWVL * 6]), Tj);
Chris@82 146 Tl = VSUB(Ti, Tk);
Chris@82 147 Tq = VADD(Ti, Tk);
Chris@82 148 {
/* Elements 2 and 6 (both after BYTW): Tg = difference, Tr = sum. */
Chris@82 149 V Td, Tf, Tc, Te;
Chris@82 150 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 151 Td = BYTW(&(W[TWVL * 2]), Tc);
Chris@82 152 Te = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 153 Tf = BYTW(&(W[TWVL * 10]), Te);
Chris@82 154 Tg = VSUB(Td, Tf);
Chris@82 155 Tr = VADD(Td, Tf);
Chris@82 156 }
Chris@82 157 {
/* Elements 1 and 5 (both after BYTW): T5 = difference, Tt = sum. */
Chris@82 158 V T2, T4, T1, T3;
Chris@82 159 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 160 T2 = BYTW(&(W[0]), T1);
Chris@82 161 T3 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 162 T4 = BYTW(&(W[TWVL * 8]), T3);
Chris@82 163 T5 = VSUB(T2, T4);
Chris@82 164 Tt = VADD(T2, T4);
Chris@82 165 }
Chris@82 166 {
/* Elements 7 and 3 (both after BYTW), 7 taken first: Ta = difference, Tu = sum. */
Chris@82 167 V T7, T9, T6, T8;
Chris@82 168 T6 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 169 T7 = BYTW(&(W[TWVL * 12]), T6);
Chris@82 170 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 171 T9 = BYTW(&(W[TWVL * 4]), T8);
Chris@82 172 Ta = VSUB(T7, T9);
Chris@82 173 Tu = VADD(T7, T9);
Chris@82 174 }
Chris@82 175 {
/* Slots 0, 2, 4 and 6 are built from the four sums Tq, Tr, Tt, Tu.
 * NOTE(review): VBYI presumably multiplies by the imaginary unit -- confirm. */
Chris@82 176 V Ts, Tv, Tw, Tx;
Chris@82 177 Ts = VSUB(Tq, Tr);
Chris@82 178 Tv = VBYI(VSUB(Tt, Tu));
Chris@82 179 ST(&(x[WS(rs, 6)]), VSUB(Ts, Tv), ms, &(x[0]));
Chris@82 180 ST(&(x[WS(rs, 2)]), VADD(Ts, Tv), ms, &(x[0]));
Chris@82 181 Tw = VADD(Tq, Tr);
Chris@82 182 Tx = VADD(Tt, Tu);
Chris@82 183 ST(&(x[WS(rs, 4)]), VSUB(Tw, Tx), ms, &(x[0]));
Chris@82 184 ST(&(x[0]), VADD(Tw, Tx), ms, &(x[0]));
Chris@82 185 {
/* Slots 1, 3, 5 and 7 are built from the four differences Tl, Tg, T5, Ta:
 * Tb and Tm are (T5 - Ta) and (T5 + Ta) scaled by KP707106781; Th and To
 * are the VBYI'd combinations of Tb with Tg; Tn and Tp are Tl -/+ Tm. */
Chris@82 186 V Th, To, Tn, Tp, Tb, Tm;
Chris@82 187 Tb = VMUL(LDK(KP707106781), VSUB(T5, Ta));
Chris@82 188 Th = VBYI(VSUB(Tb, Tg));
Chris@82 189 To = VBYI(VADD(Tg, Tb));
Chris@82 190 Tm = VMUL(LDK(KP707106781), VADD(T5, Ta));
Chris@82 191 Tn = VSUB(Tl, Tm);
Chris@82 192 Tp = VADD(Tl, Tm);
Chris@82 193 ST(&(x[WS(rs, 3)]), VADD(Th, Tn), ms, &(x[WS(rs, 1)]));
Chris@82 194 ST(&(x[WS(rs, 7)]), VSUB(Tp, To), ms, &(x[WS(rs, 1)]));
Chris@82 195 ST(&(x[WS(rs, 5)]), VSUB(Tn, Th), ms, &(x[WS(rs, 1)]));
Chris@82 196 ST(&(x[WS(rs, 1)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
Chris@82 197 }
Chris@82 198 }
Chris@82 199 }
Chris@82 200 }
Chris@82 201 VLEAVE();
Chris@82 202 }
Chris@82 203
/*
 * Twiddle instruction table referenced by desc: seven VTW entries whose
 * second argument runs 1..7, followed by a TW_NEXT record carrying VL.
 * NOTE(review): presumably one entry per element that t2bv_8 passes
 * through BYTW (elements 1..7) -- confirm against the VTW definition.
 */
Chris@82 204 static const tw_instr twinstr[] = {
Chris@82 205 VTW(0, 1),
Chris@82 206 VTW(0, 2),
Chris@82 207 VTW(0, 3),
Chris@82 208 VTW(0, 4),
Chris@82 209 VTW(0, 5),
Chris@82 210 VTW(0, 6),
Chris@82 211 VTW(0, 7),
Chris@82 212 {TW_NEXT, VL, 0}
Chris@82 213 };
Chris@82 214
/*
 * Codelet descriptor: 8 (the -n 8 of the generator command line), the
 * codelet name string, the twinstr table and &GENUS.  {33, 16, 0, 0}
 * repeats the "33 additions, 16 multiplications, 0 fused multiply/add"
 * count from the generated comment for this variant.  The meaning of the
 * three trailing zeros is not visible here; see the ct_desc definition.
 */
Chris@82 215 static const ct_desc desc = { 8, XSIMD_STRING("t2bv_8"), twinstr, &GENUS, {33, 16, 0, 0}, 0, 0, 0 };
Chris@82 216
/*
 * Registration entry point (non-FMA build): hands the t2bv_8 function and
 * its descriptor to the planner p through kdft_dit_register.
 */
Chris@82 217 void XSIMD(codelet_t2bv_8) (planner *p) {
Chris@82 218 X(kdft_dit_register) (p, t2bv_8, &desc);
Chris@82 219 }
Chris@82 220 #endif