annotate src/fftw-3.3.8/dft/simd/common/t1fv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:27 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1fv_8 -include dft/simd/t1f.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 33 FP additions, 24 FP multiplications,
Chris@82 32 * (or, 23 additions, 14 multiplications, 10 fused multiply/add),
Chris@82 33 * 24 stack variables, 1 constants, and 16 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t1f.h"
Chris@82 36
/*
 * t1fv_8 (variant compiled when an FMA preference macro is defined):
 * genfft-generated SIMD twiddle codelet of size 8.
 *
 * Each loop pass loads eight vectors from x[WS(rs, k)] for k = 0..7, passes
 * inputs 1..7 through BYTWJ with the twiddle entry W[TWVL * 2 * (k - 1)]
 * (input 0 is used as loaded), and stores eight result vectors back to the
 * same eight x locations.
 *
 * ri     - base pointer of the data; x is initialised from it.
 * ii     - not referenced anywhere in this body.
 * W      - twiddle table; offset by mb * ((TWVL / VL) * 14) before the first
 *          pass and advanced by TWVL * 14 after every pass.
 * rs     - stride handed to WS() to address the eight inputs and outputs.
 * mb, me - the loop runs m = mb, mb + VL, ... while m < me.
 * ms     - x advances by VL * ms per pass; also forwarded to LD and ST.
 *
 * NOTE(review): LD, ST, BYTWJ, VFMA, VFNMS, VFMAI, VFNMSI, WS, DVK, LDK,
 * MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this file (presumably
 * reachable through dft/simd/t1f.h); remarks about their meaning below are
 * assumptions to confirm against those headers.
 */
Chris@82 37 static void t1fv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Constant 0.7071067811865..., numerically 1/sqrt(2). */
Chris@82 39 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 40 {
Chris@82 41 INT m;
Chris@82 42 R *x;
Chris@82 43 x = ri;
/* One pass per VL values of m; x and W are stepped in the loop header. */
Chris@82 44 for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
Chris@82 45 V T4, Tq, Tl, Tr, T9, Tt, Te, Tu, T1, T3, T2;
/* Inputs 0 and 4: T3 is input 4 after BYTWJ; T4 / Tq are difference / sum. */
Chris@82 46 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 47 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 48 T3 = BYTWJ(&(W[TWVL * 6]), T2);
Chris@82 49 T4 = VSUB(T1, T3);
Chris@82 50 Tq = VADD(T1, T3);
Chris@82 51 {
/* Inputs 2 and 6 after BYTWJ: Tl is their difference, Tr their sum. */
Chris@82 52 V Ti, Tk, Th, Tj;
Chris@82 53 Th = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 54 Ti = BYTWJ(&(W[TWVL * 2]), Th);
Chris@82 55 Tj = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 56 Tk = BYTWJ(&(W[TWVL * 10]), Tj);
Chris@82 57 Tl = VSUB(Ti, Tk);
Chris@82 58 Tr = VADD(Ti, Tk);
Chris@82 59 }
Chris@82 60 {
/* Inputs 1 and 5 after BYTWJ: T9 is their difference, Tt their sum. */
Chris@82 61 V T6, T8, T5, T7;
Chris@82 62 T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 63 T6 = BYTWJ(&(W[0]), T5);
Chris@82 64 T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 65 T8 = BYTWJ(&(W[TWVL * 8]), T7);
Chris@82 66 T9 = VSUB(T6, T8);
Chris@82 67 Tt = VADD(T6, T8);
Chris@82 68 }
Chris@82 69 {
/* Inputs 7 and 3 after BYTWJ: Te is (input 7 - input 3), Tu their sum. */
Chris@82 70 V Tb, Td, Ta, Tc;
Chris@82 71 Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 72 Tb = BYTWJ(&(W[TWVL * 12]), Ta);
Chris@82 73 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 74 Td = BYTWJ(&(W[TWVL * 4]), Tc);
Chris@82 75 Te = VSUB(Tb, Td);
Chris@82 76 Tu = VADD(Tb, Td);
Chris@82 77 }
Chris@82 78 {
/*
 * Output stage. All eight loads of this pass are above this point, so the
 * stores below overwrite x only after every input has been read.
 */
Chris@82 79 V Ts, Tv, Tw, Tx;
/* Outputs 0 and 4 come from the sum / difference of the two partial sums. */
Chris@82 80 Ts = VADD(Tq, Tr);
Chris@82 81 Tv = VADD(Tt, Tu);
Chris@82 82 ST(&(x[WS(rs, 4)]), VSUB(Ts, Tv), ms, &(x[0]));
Chris@82 83 ST(&(x[0]), VADD(Ts, Tv), ms, &(x[0]));
/* Outputs 6 and 2 combine Tw with Tx through VFNMSI / VFMAI
 * (presumably Tw -/+ i * Tx; confirm against the macro definitions). */
Chris@82 84 Tw = VSUB(Tq, Tr);
Chris@82 85 Tx = VSUB(Tu, Tt);
Chris@82 86 ST(&(x[WS(rs, 6)]), VFNMSI(Tx, Tw), ms, &(x[0]));
Chris@82 87 ST(&(x[WS(rs, 2)]), VFMAI(Tx, Tw), ms, &(x[0]));
Chris@82 88 {
/* Odd outputs 1, 3, 5, 7: built from T4, Tl and the KP707106781-scaled
 * sum (Tf) and difference (Tm) of T9 and Te. */
Chris@82 89 V Tg, To, Tn, Tp, Tf, Tm;
Chris@82 90 Tf = VADD(T9, Te);
Chris@82 91 Tg = VFMA(LDK(KP707106781), Tf, T4);
Chris@82 92 To = VFNMS(LDK(KP707106781), Tf, T4);
Chris@82 93 Tm = VSUB(Te, T9);
Chris@82 94 Tn = VFNMS(LDK(KP707106781), Tm, Tl);
Chris@82 95 Tp = VFMA(LDK(KP707106781), Tm, Tl);
Chris@82 96 ST(&(x[WS(rs, 1)]), VFNMSI(Tn, Tg), ms, &(x[WS(rs, 1)]));
Chris@82 97 ST(&(x[WS(rs, 3)]), VFMAI(Tp, To), ms, &(x[WS(rs, 1)]));
Chris@82 98 ST(&(x[WS(rs, 7)]), VFMAI(Tn, Tg), ms, &(x[WS(rs, 1)]));
Chris@82 99 ST(&(x[WS(rs, 5)]), VFNMSI(Tp, To), ms, &(x[WS(rs, 1)]));
Chris@82 100 }
Chris@82 101 }
Chris@82 102 }
Chris@82 103 }
Chris@82 104 VLEAVE();
Chris@82 105 }
Chris@82 106
/*
 * Twiddle instruction table referenced by desc: seven VTW entries whose
 * second argument runs from 1 to 7, closed by a TW_NEXT entry carrying VL.
 * NOTE(review): presumably one entry per twiddle consumed by inputs 1..7 of
 * t1fv_8; confirm against the tw_instr and VTW definitions.
 */
Chris@82 107 static const tw_instr twinstr[] = {
Chris@82 108 VTW(0, 1),
Chris@82 109 VTW(0, 2),
Chris@82 110 VTW(0, 3),
Chris@82 111 VTW(0, 4),
Chris@82 112 VTW(0, 5),
Chris@82 113 VTW(0, 6),
Chris@82 114 VTW(0, 7),
Chris@82 115 {TW_NEXT, VL, 0}
Chris@82 116 };
Chris@82 117
/*
 * Codelet descriptor handed to the planner at registration: the value 8, the
 * name string for t1fv_8, the twinstr table, &GENUS, and the counts
 * {23, 14, 10, 0}, which agree with the 23 additions, 14 multiplications and
 * 10 fused multiply/add stated in the generator comment for this variant.
 * The three trailing fields are zero.
 * NOTE(review): field meanings are inferred from position; confirm against
 * the ct_desc declaration.
 */
Chris@82 118 static const ct_desc desc = { 8, XSIMD_STRING("t1fv_8"), twinstr, &GENUS, {23, 14, 10, 0}, 0, 0, 0 };
Chris@82 119
/*
 * Registration entry point for this codelet: passes t1fv_8 and its
 * descriptor desc to X(kdft_dit_register) on the given planner p.
 */
Chris@82 120 void XSIMD(codelet_t1fv_8) (planner *p) {
Chris@82 121 X(kdft_dit_register) (p, t1fv_8, &desc);
Chris@82 122 }
Chris@82 123 #else
Chris@82 124
Chris@82 125 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name t1fv_8 -include dft/simd/t1f.h */
Chris@82 126
Chris@82 127 /*
Chris@82 128 * This function contains 33 FP additions, 16 FP multiplications,
Chris@82 129 * (or, 33 additions, 16 multiplications, 0 fused multiply/add),
Chris@82 130 * 24 stack variables, 1 constants, and 16 memory accesses
Chris@82 131 */
Chris@82 132 #include "dft/simd/t1f.h"
Chris@82 133
/*
 * t1fv_8 (variant compiled when no FMA preference macro is defined):
 * genfft-generated SIMD twiddle codelet of size 8, written with separate
 * VMUL / VADD / VSUB / VBYI operations instead of fused forms.
 *
 * Each loop pass loads eight vectors from x[WS(rs, k)] for k = 0..7, passes
 * inputs 1..7 through BYTWJ with the twiddle entry W[TWVL * 2 * (k - 1)]
 * (input 0 is used as loaded), and stores eight result vectors back to the
 * same eight x locations.
 *
 * ri     - base pointer of the data; x is initialised from it.
 * ii     - not referenced anywhere in this body.
 * W      - twiddle table; offset by mb * ((TWVL / VL) * 14) before the first
 *          pass and advanced by TWVL * 14 after every pass.
 * rs     - stride handed to WS() to address the eight inputs and outputs.
 * mb, me - the loop runs m = mb, mb + VL, ... while m < me.
 * ms     - x advances by VL * ms per pass; also forwarded to LD and ST.
 *
 * NOTE(review): LD, ST, BYTWJ, VMUL, VBYI, WS, DVK, LDK,
 * MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this file (presumably
 * reachable through dft/simd/t1f.h); remarks about their meaning below are
 * assumptions to confirm against those headers.
 */
Chris@82 134 static void t1fv_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 135 {
/* Constant 0.7071067811865..., numerically 1/sqrt(2). */
Chris@82 136 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 137 {
Chris@82 138 INT m;
Chris@82 139 R *x;
Chris@82 140 x = ri;
/* One pass per VL values of m; x and W are stepped in the loop header. */
Chris@82 141 for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(8, rs)) {
Chris@82 142 V T4, Tq, Tm, Tr, T9, Tt, Te, Tu, T1, T3, T2;
/* Inputs 0 and 4: T3 is input 4 after BYTWJ; T4 / Tq are difference / sum. */
Chris@82 143 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 144 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 145 T3 = BYTWJ(&(W[TWVL * 6]), T2);
Chris@82 146 T4 = VSUB(T1, T3);
Chris@82 147 Tq = VADD(T1, T3);
Chris@82 148 {
/* Inputs 2 and 6 after BYTWJ: Tm is their difference, Tr their sum. */
Chris@82 149 V Tj, Tl, Ti, Tk;
Chris@82 150 Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 151 Tj = BYTWJ(&(W[TWVL * 2]), Ti);
Chris@82 152 Tk = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 153 Tl = BYTWJ(&(W[TWVL * 10]), Tk);
Chris@82 154 Tm = VSUB(Tj, Tl);
Chris@82 155 Tr = VADD(Tj, Tl);
Chris@82 156 }
Chris@82 157 {
/* Inputs 1 and 5 after BYTWJ: T9 is their difference, Tt their sum. */
Chris@82 158 V T6, T8, T5, T7;
Chris@82 159 T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 160 T6 = BYTWJ(&(W[0]), T5);
Chris@82 161 T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 162 T8 = BYTWJ(&(W[TWVL * 8]), T7);
Chris@82 163 T9 = VSUB(T6, T8);
Chris@82 164 Tt = VADD(T6, T8);
Chris@82 165 }
Chris@82 166 {
/* Inputs 7 and 3 after BYTWJ: Te is (input 7 - input 3), Tu their sum. */
Chris@82 167 V Tb, Td, Ta, Tc;
Chris@82 168 Ta = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 169 Tb = BYTWJ(&(W[TWVL * 12]), Ta);
Chris@82 170 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 171 Td = BYTWJ(&(W[TWVL * 4]), Tc);
Chris@82 172 Te = VSUB(Tb, Td);
Chris@82 173 Tu = VADD(Tb, Td);
Chris@82 174 }
Chris@82 175 {
/*
 * Output stage. All eight loads of this pass are above this point, so the
 * stores below overwrite x only after every input has been read.
 */
Chris@82 176 V Ts, Tv, Tw, Tx;
/* Outputs 0 and 4 come from the sum / difference of the two partial sums. */
Chris@82 177 Ts = VADD(Tq, Tr);
Chris@82 178 Tv = VADD(Tt, Tu);
Chris@82 179 ST(&(x[WS(rs, 4)]), VSUB(Ts, Tv), ms, &(x[0]));
Chris@82 180 ST(&(x[0]), VADD(Ts, Tv), ms, &(x[0]));
/* Outputs 6 and 2 are Tw - Tx and Tw + Tx, with Tx = VBYI(Tu - Tt)
 * (VBYI presumably multiplies by i; confirm against the macro definition). */
Chris@82 181 Tw = VSUB(Tq, Tr);
Chris@82 182 Tx = VBYI(VSUB(Tu, Tt));
Chris@82 183 ST(&(x[WS(rs, 6)]), VSUB(Tw, Tx), ms, &(x[0]));
Chris@82 184 ST(&(x[WS(rs, 2)]), VADD(Tw, Tx), ms, &(x[0]));
Chris@82 185 {
/* Odd outputs 1, 3, 5, 7: Tf and Th are the KP707106781-scaled sum and
 * difference of T9 and Te; Tn and Tp pass (Th - Tm) and (Tm + Th) to VBYI. */
Chris@82 186 V Tg, To, Tn, Tp, Tf, Th;
Chris@82 187 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
Chris@82 188 Tg = VADD(T4, Tf);
Chris@82 189 To = VSUB(T4, Tf);
Chris@82 190 Th = VMUL(LDK(KP707106781), VSUB(Te, T9));
Chris@82 191 Tn = VBYI(VSUB(Th, Tm));
Chris@82 192 Tp = VBYI(VADD(Tm, Th));
Chris@82 193 ST(&(x[WS(rs, 7)]), VSUB(Tg, Tn), ms, &(x[WS(rs, 1)]));
Chris@82 194 ST(&(x[WS(rs, 3)]), VADD(To, Tp), ms, &(x[WS(rs, 1)]));
Chris@82 195 ST(&(x[WS(rs, 1)]), VADD(Tg, Tn), ms, &(x[WS(rs, 1)]));
Chris@82 196 ST(&(x[WS(rs, 5)]), VSUB(To, Tp), ms, &(x[WS(rs, 1)]));
Chris@82 197 }
Chris@82 198 }
Chris@82 199 }
Chris@82 200 }
Chris@82 201 VLEAVE();
Chris@82 202 }
Chris@82 203
/*
 * Twiddle instruction table referenced by desc: seven VTW entries whose
 * second argument runs from 1 to 7, closed by a TW_NEXT entry carrying VL.
 * NOTE(review): presumably one entry per twiddle consumed by inputs 1..7 of
 * t1fv_8; confirm against the tw_instr and VTW definitions.
 */
Chris@82 204 static const tw_instr twinstr[] = {
Chris@82 205 VTW(0, 1),
Chris@82 206 VTW(0, 2),
Chris@82 207 VTW(0, 3),
Chris@82 208 VTW(0, 4),
Chris@82 209 VTW(0, 5),
Chris@82 210 VTW(0, 6),
Chris@82 211 VTW(0, 7),
Chris@82 212 {TW_NEXT, VL, 0}
Chris@82 213 };
Chris@82 214
/*
 * Codelet descriptor handed to the planner at registration: the value 8, the
 * name string for t1fv_8, the twinstr table, &GENUS, and the counts
 * {33, 16, 0, 0}, which agree with the 33 additions, 16 multiplications and
 * 0 fused multiply/add stated in the generator comment for this variant.
 * The three trailing fields are zero.
 * NOTE(review): field meanings are inferred from position; confirm against
 * the ct_desc declaration.
 */
Chris@82 215 static const ct_desc desc = { 8, XSIMD_STRING("t1fv_8"), twinstr, &GENUS, {33, 16, 0, 0}, 0, 0, 0 };
Chris@82 216
/*
 * Registration entry point for this codelet: passes t1fv_8 and its
 * descriptor desc to X(kdft_dit_register) on the given planner p.
 */
Chris@82 217 void XSIMD(codelet_t1fv_8) (planner *p) {
Chris@82 218 X(kdft_dit_register) (p, t1fv_8, &desc);
Chris@82 219 }
Chris@82 220 #endif