annotate src/fftw-3.3.3/dft/simd/common/t1sv_4.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:39:24 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1sv_4 -include ts.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 22 FP additions, 12 FP multiplications,
Chris@10 32 * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
Chris@10 33 * 35 stack variables, 0 constants, and 16 memory accesses
Chris@10 34 */
Chris@10 35 #include "ts.h"
Chris@10 36
/*
 * t1sv_4 (HAVE_FMA variant): generated size-4 twiddle codelet that works in
 * place on separate real (ri) and imaginary (ii) arrays.
 *
 * ri, ii  - real / imaginary data; every location that is loaded is also
 *           stored back to before the next iteration
 * W       - twiddle table; six vectors (W[0], W[TWVL * 1] .. W[TWVL * 5])
 *           are read per iteration, starting at offset mb * 6
 * rs      - stride handed to WS() to address the four inputs
 * mb, me  - loop bounds: m starts at mb and the loop runs while m < me
 * ms      - stride handed to LD/ST; also scales the per-iteration advance
 *           of ri and ii
 *
 * NOTE(review): LD, ST, LDW, VMUL, VFMA, VFNMS, VADD, VSUB, VLEAVE and
 * MAKE_VOLATILE_STRIDE are defined outside this file. The notes below
 * assume VFMA(a, b, c) = a * b + c and VFNMS(a, b, c) = c - a * b;
 * confirm against the SIMD headers.
 */
Chris@10 37 static void t1sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 {
Chris@10 40 INT m;
/* Each pass advances m by 2 * VL, ri/ii by (2 * VL) * ms and W by (2 * VL) * 6. */
Chris@10 41 for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(8, rs)) {
Chris@10 42 V T1, Tv, T3, T6, T5, Ta, Td, Tc, Tg, Tj, Tt, T4, Tf, Ti, Tn;
Chris@10 43 V Tb, T2, T9;
/* Input 0 is used as loaded; no twiddle is applied to it. */
Chris@10 44 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@10 45 Tv = LD(&(ii[0]), ms, &(ii[0]));
/* Loads of inputs 1..3 and their twiddle pairs are interleaved with the
 * first multiplications (generator was run with -schedule-for-pipeline). */
Chris@10 46 T3 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@10 47 T6 = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@10 48 T2 = LDW(&(W[TWVL * 2]));
Chris@10 49 T5 = LDW(&(W[TWVL * 3]));
Chris@10 50 Ta = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@10 51 Td = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@10 52 T9 = LDW(&(W[0]));
Chris@10 53 Tc = LDW(&(W[TWVL * 1]));
Chris@10 54 Tg = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@10 55 Tj = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@10 56 Tt = VMUL(T2, T6);
Chris@10 57 T4 = VMUL(T2, T3);
Chris@10 58 Tf = LDW(&(W[TWVL * 4]));
Chris@10 59 Ti = LDW(&(W[TWVL * 5]));
Chris@10 60 Tn = VMUL(T9, Td);
Chris@10 61 Tb = VMUL(T9, Ta);
Chris@10 62 {
Chris@10 63 V Tu, T7, Tp, Th, To, Te;
/* Under the assumed macro meanings: (T7, Tu) is input 2 times the complex
 * conjugate of (T2, T5); likewise (Te, To) for input 1 with (T9, Tc). */
Chris@10 64 Tu = VFNMS(T5, T3, Tt);
Chris@10 65 T7 = VFMA(T5, T6, T4);
Chris@10 66 Tp = VMUL(Tf, Tj);
Chris@10 67 Th = VMUL(Tf, Tg);
Chris@10 68 To = VFNMS(Tc, Ta, Tn);
Chris@10 69 Te = VFMA(Tc, Td, Tb);
Chris@10 70 {
Chris@10 71 V Tw, Tx, T8, Tm, Tq, Tk;
/* Sums and differences of input 0 with twiddled input 2. */
Chris@10 72 Tw = VADD(Tu, Tv);
Chris@10 73 Tx = VSUB(Tv, Tu);
Chris@10 74 T8 = VADD(T1, T7);
Chris@10 75 Tm = VSUB(T1, T7);
/* (Tk, Tq): twiddled input 3, same pattern with (Tf, Ti). */
Chris@10 76 Tq = VFNMS(Ti, Tg, Tp);
Chris@10 77 Tk = VFMA(Ti, Tj, Th);
Chris@10 78 {
Chris@10 79 V Ts, Tr, Tl, Ty;
/* Sums and differences of twiddled inputs 1 and 3. */
Chris@10 80 Ts = VADD(To, Tq);
Chris@10 81 Tr = VSUB(To, Tq);
Chris@10 82 Tl = VADD(Te, Tk);
Chris@10 83 Ty = VSUB(Te, Tk);
/* Final add/subtract stage; results overwrite the eight loaded locations. */
Chris@10 84 ST(&(ri[WS(rs, 1)]), VADD(Tm, Tr), ms, &(ri[WS(rs, 1)]));
Chris@10 85 ST(&(ri[WS(rs, 3)]), VSUB(Tm, Tr), ms, &(ri[WS(rs, 1)]));
Chris@10 86 ST(&(ii[WS(rs, 2)]), VSUB(Tw, Ts), ms, &(ii[0]));
Chris@10 87 ST(&(ii[0]), VADD(Ts, Tw), ms, &(ii[0]));
Chris@10 88 ST(&(ii[WS(rs, 3)]), VADD(Ty, Tx), ms, &(ii[WS(rs, 1)]));
Chris@10 89 ST(&(ii[WS(rs, 1)]), VSUB(Tx, Ty), ms, &(ii[WS(rs, 1)]));
Chris@10 90 ST(&(ri[0]), VADD(T8, Tl), ms, &(ri[0]));
Chris@10 91 ST(&(ri[WS(rs, 2)]), VSUB(T8, Tl), ms, &(ri[0]));
Chris@10 92 }
Chris@10 93 }
Chris@10 94 }
Chris@10 95 }
Chris@10 96 }
/* Invoked once, after the loop has finished. */
Chris@10 97 VLEAVE();
Chris@10 98 }
Chris@10 99
/*
 * Twiddle instruction table referenced by desc: three VTW entries (second
 * argument 1, 2, 3) followed by a TW_NEXT entry whose (2 * VL) equals the
 * loop step used in t1sv_4.
 * NOTE(review): presumably one VTW entry per twiddled input; confirm
 * against the VTW / tw_instr definitions, which are not visible here.
 */
Chris@10 100 static const tw_instr twinstr[] = {
Chris@10 101 VTW(0, 1),
Chris@10 102 VTW(0, 2),
Chris@10 103 VTW(0, 3),
Chris@10 104 {TW_NEXT, (2 * VL), 0}
Chris@10 105 };
Chris@10 106
/*
 * Codelet descriptor passed to the registration call. The leading 4 matches
 * the generator's "-n 4" option and {16, 6, 6, 0} matches the
 * add / multiply / fused-multiply-add counts quoted in the generated header
 * comment of this branch. The meaning of the remaining fields is set by
 * ct_desc, which is declared outside this file.
 */
Chris@10 107 static const ct_desc desc = { 4, XSIMD_STRING("t1sv_4"), twinstr, &GENUS, {16, 6, 6, 0}, 0, 0, 0 };
Chris@10 108
/*
 * Registration entry point (HAVE_FMA build): passes t1sv_4 and its
 * descriptor to the planner p through X(kdft_dit_register). The function's
 * actual symbol name is produced by the XSIMD() macro.
 */
Chris@10 109 void XSIMD(codelet_t1sv_4) (planner *p) {
Chris@10 110 X(kdft_dit_register) (p, t1sv_4, &desc);
Chris@10 111 }
Chris@10 112 #else /* HAVE_FMA */
Chris@10 113
Chris@10 114 /* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -name t1sv_4 -include ts.h */
Chris@10 115
Chris@10 116 /*
Chris@10 117 * This function contains 22 FP additions, 12 FP multiplications,
Chris@10 118 * (or, 16 additions, 6 multiplications, 6 fused multiply/add),
Chris@10 119 * 13 stack variables, 0 constants, and 16 memory accesses
Chris@10 120 */
Chris@10 121 #include "ts.h"
Chris@10 122
/*
 * t1sv_4 (variant compiled when HAVE_FMA is not defined): generated size-4
 * twiddle codelet that works in place on separate real (ri) and imaginary
 * (ii) arrays.
 *
 * ri, ii  - real / imaginary data; every location that is loaded is also
 *           stored back to before the next iteration
 * W       - twiddle table; six vectors (W[0], W[TWVL * 1] .. W[TWVL * 5])
 *           are read per iteration, starting at offset mb * 6
 * rs      - stride handed to WS() to address the four inputs
 * mb, me  - loop bounds: m starts at mb and the loop runs while m < me
 * ms      - stride handed to LD/ST; also scales the per-iteration advance
 *           of ri and ii
 *
 * NOTE(review): LD, ST, LDW, VMUL, VFMA, VFNMS, VADD, VSUB, VLEAVE and
 * MAKE_VOLATILE_STRIDE are defined outside this file. The notes below
 * assume VFMA(a, b, c) = a * b + c and VFNMS(a, b, c) = c - a * b;
 * confirm against the SIMD headers.
 */
Chris@10 123 static void t1sv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 124 {
Chris@10 125 {
Chris@10 126 INT m;
/* Each pass advances m by 2 * VL, ri/ii by (2 * VL) * ms and W by (2 * VL) * 6. */
Chris@10 127 for (m = mb, W = W + (mb * 6); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 6), MAKE_VOLATILE_STRIDE(8, rs)) {
Chris@10 128 V T1, Tp, T6, To, Tc, Tk, Th, Tl;
/* Input 0 is used as loaded; no twiddle is applied to it. */
Chris@10 129 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@10 130 Tp = LD(&(ii[0]), ms, &(ii[0]));
Chris@10 131 {
/* Input 2 with twiddle pair (T2, T4). Under the assumed macro meanings,
 * T6 = T2 * T3 + T4 * T5 and To = T2 * T5 - T4 * T3, i.e. the input times
 * the complex conjugate of (T2, T4). */
Chris@10 132 V T3, T5, T2, T4;
Chris@10 133 T3 = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@10 134 T5 = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@10 135 T2 = LDW(&(W[TWVL * 2]));
Chris@10 136 T4 = LDW(&(W[TWVL * 3]));
Chris@10 137 T6 = VFMA(T2, T3, VMUL(T4, T5));
Chris@10 138 To = VFNMS(T4, T3, VMUL(T2, T5));
Chris@10 139 }
Chris@10 140 {
/* Input 1 with twiddle pair (T8, Ta); same pattern, giving (Tc, Tk). */
Chris@10 141 V T9, Tb, T8, Ta;
Chris@10 142 T9 = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@10 143 Tb = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@10 144 T8 = LDW(&(W[0]));
Chris@10 145 Ta = LDW(&(W[TWVL * 1]));
Chris@10 146 Tc = VFMA(T8, T9, VMUL(Ta, Tb));
Chris@10 147 Tk = VFNMS(Ta, T9, VMUL(T8, Tb));
Chris@10 148 }
Chris@10 149 {
/* Input 3 with twiddle pair (Td, Tf); same pattern, giving (Th, Tl). */
Chris@10 150 V Te, Tg, Td, Tf;
Chris@10 151 Te = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@10 152 Tg = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@10 153 Td = LDW(&(W[TWVL * 4]));
Chris@10 154 Tf = LDW(&(W[TWVL * 5]));
Chris@10 155 Th = VFMA(Td, Te, VMUL(Tf, Tg));
Chris@10 156 Tl = VFNMS(Tf, Te, VMUL(Td, Tg));
Chris@10 157 }
Chris@10 158 {
/* Outputs 0 and 2: built from the sums (0 + 2) and (1 + 3). */
Chris@10 159 V T7, Ti, Tn, Tq;
Chris@10 160 T7 = VADD(T1, T6);
Chris@10 161 Ti = VADD(Tc, Th);
Chris@10 162 ST(&(ri[WS(rs, 2)]), VSUB(T7, Ti), ms, &(ri[0]));
Chris@10 163 ST(&(ri[0]), VADD(T7, Ti), ms, &(ri[0]));
Chris@10 164 Tn = VADD(Tk, Tl);
Chris@10 165 Tq = VADD(To, Tp);
Chris@10 166 ST(&(ii[0]), VADD(Tn, Tq), ms, &(ii[0]));
Chris@10 167 ST(&(ii[WS(rs, 2)]), VSUB(Tq, Tn), ms, &(ii[0]));
Chris@10 168 }
Chris@10 169 {
/* Outputs 1 and 3: built from the differences (0 - 2) and (1 - 3); the
 * real outputs use the imaginary difference Tm and vice versa. */
Chris@10 170 V Tj, Tm, Tr, Ts;
Chris@10 171 Tj = VSUB(T1, T6);
Chris@10 172 Tm = VSUB(Tk, Tl);
Chris@10 173 ST(&(ri[WS(rs, 3)]), VSUB(Tj, Tm), ms, &(ri[WS(rs, 1)]));
Chris@10 174 ST(&(ri[WS(rs, 1)]), VADD(Tj, Tm), ms, &(ri[WS(rs, 1)]));
Chris@10 175 Tr = VSUB(Tp, To);
Chris@10 176 Ts = VSUB(Tc, Th);
Chris@10 177 ST(&(ii[WS(rs, 1)]), VSUB(Tr, Ts), ms, &(ii[WS(rs, 1)]));
Chris@10 178 ST(&(ii[WS(rs, 3)]), VADD(Ts, Tr), ms, &(ii[WS(rs, 1)]));
Chris@10 179 }
Chris@10 180 }
Chris@10 181 }
/* Invoked once, after the loop has finished. */
Chris@10 182 VLEAVE();
Chris@10 183 }
Chris@10 184
/*
 * Twiddle instruction table referenced by desc: three VTW entries (second
 * argument 1, 2, 3) followed by a TW_NEXT entry whose (2 * VL) equals the
 * loop step used in t1sv_4.
 * NOTE(review): presumably one VTW entry per twiddled input; confirm
 * against the VTW / tw_instr definitions, which are not visible here.
 */
Chris@10 185 static const tw_instr twinstr[] = {
Chris@10 186 VTW(0, 1),
Chris@10 187 VTW(0, 2),
Chris@10 188 VTW(0, 3),
Chris@10 189 {TW_NEXT, (2 * VL), 0}
Chris@10 190 };
Chris@10 191
/*
 * Codelet descriptor passed to the registration call. The leading 4 matches
 * the generator's "-n 4" option and {16, 6, 6, 0} matches the
 * add / multiply / fused-multiply-add counts quoted in the generated header
 * comment of this branch. The meaning of the remaining fields is set by
 * ct_desc, which is declared outside this file.
 */
Chris@10 192 static const ct_desc desc = { 4, XSIMD_STRING("t1sv_4"), twinstr, &GENUS, {16, 6, 6, 0}, 0, 0, 0 };
Chris@10 193
/*
 * Registration entry point (build without HAVE_FMA): passes t1sv_4 and its
 * descriptor to the planner p through X(kdft_dit_register). The function's
 * actual symbol name is produced by the XSIMD() macro.
 */
Chris@10 194 void XSIMD(codelet_t1sv_4) (planner *p) {
Chris@10 195 X(kdft_dit_register) (p, t1sv_4, &desc);
Chris@10 196 }
Chris@10 197 #endif /* HAVE_FMA */