annotate src/fftw-3.3.8/dft/simd/common/t1fv_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:27 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1fv_10 -include dft/simd/t1f.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 51 FP additions, 40 FP multiplications,
Chris@82 32 * (or, 33 additions, 22 multiplications, 18 fused multiply/add),
Chris@82 33 * 32 stack variables, 4 constants, and 20 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t1f.h"
Chris@82 36
/*
 * t1fv_10, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generator flags: -fma -simd -n 10).
 *
 * Processes ten strided elements in place per iteration.
 *
 *   ri      base of the data; the body addresses everything through x = ri.
 *   ii      not referenced anywhere in this body.
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 18) before the
 *           first iteration and advanced by TWVL * 18 after each iteration.
 *   rs      stride handed to WS(rs, k) to locate element k (k = 1..9).
 *   mb, me  iteration range: m starts at mb, runs while m < me, steps by VL.
 *   ms      x advances by VL * ms per iteration; ms is also passed to every
 *           LD and ST.
 *
 * Each iteration loads x[0] and x[WS(rs, 1..9)], passes element k (k = 1..9)
 * through BYTWJ together with &W[TWVL * 2 * (k - 1)], combines the results,
 * and stores ten values back to the same ten locations.
 *
 * NOTE(review): LD, ST, BYTWJ, LDK, VFMA, VFNMS, VFMAI, VFNMSI,
 * MAKE_VOLATILE_STRIDE and VLEAVE are macros from the included headers; their
 * exact semantics are not visible in this file.
 */
Chris@82 37 static void t1fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric values: 0.5590... = sqrt(5)/4, 0.6180... = (sqrt(5) - 1)/2,
   0.9510... = sin(2*pi/5), 0.25 = 1/4. */
Chris@82 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 43 {
Chris@82 44 INT m;
Chris@82 45 R *x;
Chris@82 46 x = ri;
/* One pass per VL-sized step of m; x and W are advanced in the loop header. */
Chris@82 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@82 48 V T4, TA, Tk, Tp, Tq, TE, TF, TG, T9, Te, Tf, TB, TC, TD, T1;
Chris@82 49 V T3, T2;
/* Element 0 and twiddled element 5: T4 is their difference, TA their sum. */
Chris@82 50 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 51 T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 52 T3 = BYTWJ(&(W[TWVL * 8]), T2);
Chris@82 53 T4 = VSUB(T1, T3);
Chris@82 54 TA = VADD(T1, T3);
/* Twiddled elements 4, 1, 9, 6: differences Tk, Tp (and Tq = Tk + Tp),
   sums TE, TF (and TG = TE + TF). */
Chris@82 55 {
Chris@82 56 V Th, To, Tj, Tm;
Chris@82 57 {
Chris@82 58 V Tg, Tn, Ti, Tl;
Chris@82 59 Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 60 Th = BYTWJ(&(W[TWVL * 6]), Tg);
Chris@82 61 Tn = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 62 To = BYTWJ(&(W[0]), Tn);
Chris@82 63 Ti = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 64 Tj = BYTWJ(&(W[TWVL * 16]), Ti);
Chris@82 65 Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 66 Tm = BYTWJ(&(W[TWVL * 10]), Tl);
Chris@82 67 }
Chris@82 68 Tk = VSUB(Th, Tj);
Chris@82 69 Tp = VSUB(Tm, To);
Chris@82 70 Tq = VADD(Tk, Tp);
Chris@82 71 TE = VADD(Th, Tj);
Chris@82 72 TF = VADD(Tm, To);
Chris@82 73 TG = VADD(TE, TF);
Chris@82 74 }
/* Twiddled elements 2, 3, 7, 8: differences T9, Te (and Tf = T9 + Te),
   sums TB, TC (and TD = TB + TC). */
Chris@82 75 {
Chris@82 76 V T6, Td, T8, Tb;
Chris@82 77 {
Chris@82 78 V T5, Tc, T7, Ta;
Chris@82 79 T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 80 T6 = BYTWJ(&(W[TWVL * 2]), T5);
Chris@82 81 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 82 Td = BYTWJ(&(W[TWVL * 4]), Tc);
Chris@82 83 T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 84 T8 = BYTWJ(&(W[TWVL * 12]), T7);
Chris@82 85 Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 86 Tb = BYTWJ(&(W[TWVL * 14]), Ta);
Chris@82 87 }
Chris@82 88 T9 = VSUB(T6, T8);
Chris@82 89 Te = VSUB(Tb, Td);
Chris@82 90 Tf = VADD(T9, Te);
Chris@82 91 TB = VADD(T6, T8);
Chris@82 92 TC = VADD(Tb, Td);
Chris@82 93 TD = VADD(TB, TC);
Chris@82 94 }
/* Outputs 5, 3, 7, 1, 9: built from T4 and the difference terms. */
Chris@82 95 {
Chris@82 96 V Tt, Tr, Ts, Tx, Tz, Tv, Tw, Ty, Tu;
Chris@82 97 Tt = VSUB(Tf, Tq);
Chris@82 98 Tr = VADD(Tf, Tq);
Chris@82 99 Ts = VFNMS(LDK(KP250000000), Tr, T4);
Chris@82 100 Tv = VSUB(T9, Te);
Chris@82 101 Tw = VSUB(Tk, Tp);
Chris@82 102 Tx = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tw, Tv));
Chris@82 103 Tz = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tv, Tw));
Chris@82 104 ST(&(x[WS(rs, 5)]), VADD(T4, Tr), ms, &(x[WS(rs, 1)]));
Chris@82 105 Ty = VFNMS(LDK(KP559016994), Tt, Ts);
Chris@82 106 ST(&(x[WS(rs, 3)]), VFNMSI(Tz, Ty), ms, &(x[WS(rs, 1)]));
Chris@82 107 ST(&(x[WS(rs, 7)]), VFMAI(Tz, Ty), ms, &(x[WS(rs, 1)]));
Chris@82 108 Tu = VFMA(LDK(KP559016994), Tt, Ts);
Chris@82 109 ST(&(x[WS(rs, 1)]), VFNMSI(Tx, Tu), ms, &(x[WS(rs, 1)]));
Chris@82 110 ST(&(x[WS(rs, 9)]), VFMAI(Tx, Tu), ms, &(x[WS(rs, 1)]));
Chris@82 111 }
/* Outputs 0, 4, 6, 2, 8: built from TA and the sum terms. */
Chris@82 112 {
Chris@82 113 V TJ, TH, TI, TN, TP, TL, TM, TO, TK;
Chris@82 114 TJ = VSUB(TD, TG);
Chris@82 115 TH = VADD(TD, TG);
Chris@82 116 TI = VFNMS(LDK(KP250000000), TH, TA);
Chris@82 117 TL = VSUB(TE, TF);
Chris@82 118 TM = VSUB(TB, TC);
Chris@82 119 TN = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TM, TL));
Chris@82 120 TP = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TL, TM));
Chris@82 121 ST(&(x[0]), VADD(TA, TH), ms, &(x[0]));
Chris@82 122 TO = VFMA(LDK(KP559016994), TJ, TI);
Chris@82 123 ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
Chris@82 124 ST(&(x[WS(rs, 6)]), VFNMSI(TP, TO), ms, &(x[0]));
Chris@82 125 TK = VFNMS(LDK(KP559016994), TJ, TI);
Chris@82 126 ST(&(x[WS(rs, 2)]), VFMAI(TN, TK), ms, &(x[0]));
Chris@82 127 ST(&(x[WS(rs, 8)]), VFNMSI(TN, TK), ms, &(x[0]));
Chris@82 128 }
Chris@82 129 }
Chris@82 130 }
Chris@82 131 VLEAVE();
Chris@82 132 }
Chris@82 133
/*
 * Twiddle instruction table for the FMA variant of t1fv_10: nine VTW entries
 * with second argument 1..9, closed by a TW_NEXT record carrying VL.
 * The nine entries line up with the nine BYTWJ calls in t1fv_10.
 * NOTE(review): VTW and TW_NEXT are defined in headers not visible here;
 * presumably this tells the planner how to lay out W -- confirm.
 */
Chris@82 134 static const tw_instr twinstr[] = {
Chris@82 135 VTW(0, 1),
Chris@82 136 VTW(0, 2),
Chris@82 137 VTW(0, 3),
Chris@82 138 VTW(0, 4),
Chris@82 139 VTW(0, 5),
Chris@82 140 VTW(0, 6),
Chris@82 141 VTW(0, 7),
Chris@82 142 VTW(0, 8),
Chris@82 143 VTW(0, 9),
Chris@82 144 {TW_NEXT, VL, 0}
Chris@82 145 };
Chris@82 146
/*
 * Descriptor for the FMA variant: the value 10, the codelet name string,
 * the twinstr table, &GENUS, and the tuple {33, 22, 18, 0}, whose first three
 * numbers equal the "33 additions, 22 multiplications, 18 fused multiply/add"
 * tally in this branch's generator comment.
 * NOTE(review): the meaning of each ct_desc field, including the three
 * trailing zeros, is defined outside this file.
 */
Chris@82 147 static const ct_desc desc = { 10, XSIMD_STRING("t1fv_10"), twinstr, &GENUS, {33, 22, 18, 0}, 0, 0, 0 };
Chris@82 148
/*
 * Registration entry point (FMA variant): hands t1fv_10 and its descriptor
 * to the planner p through kdft_dit_register.
 */
Chris@82 149 void XSIMD(codelet_t1fv_10) (planner *p) {
Chris@82 150 X(kdft_dit_register) (p, t1fv_10, &desc);
Chris@82 151 }
Chris@82 152 #else
Chris@82 153
Chris@82 154 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1fv_10 -include dft/simd/t1f.h */
Chris@82 155
Chris@82 156 /*
Chris@82 157 * This function contains 51 FP additions, 30 FP multiplications,
Chris@82 158 * (or, 45 additions, 24 multiplications, 6 fused multiply/add),
Chris@82 159 * 32 stack variables, 4 constants, and 20 memory accesses
Chris@82 160 */
Chris@82 161 #include "dft/simd/t1f.h"
Chris@82 162
/*
 * t1fv_10, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (generator flags: -simd -n 10,
 * without -fma).
 *
 * Processes ten strided elements in place per iteration.
 *
 *   ri      base of the data; the body addresses everything through x = ri.
 *   ii      not referenced anywhere in this body.
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 18) before the
 *           first iteration and advanced by TWVL * 18 after each iteration.
 *   rs      stride handed to WS(rs, k) to locate element k (k = 1..9).
 *   mb, me  iteration range: m starts at mb, runs while m < me, steps by VL.
 *   ms      x advances by VL * ms per iteration; ms is also passed to every
 *           LD and ST.
 *
 * Each iteration loads x[0] and x[WS(rs, 1..9)], passes element k (k = 1..9)
 * through BYTWJ together with &W[TWVL * 2 * (k - 1)], combines the results,
 * and stores ten values back to the same ten locations.
 *
 * NOTE(review): LD, ST, BYTWJ, LDK, VBYI, VFMA, VFNMS,
 * MAKE_VOLATILE_STRIDE and VLEAVE are macros from the included headers; their
 * exact semantics are not visible in this file.
 */
Chris@82 163 static void t1fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 164 {
/* Numeric values: 0.5877... = sin(pi/5), 0.9510... = sin(2*pi/5),
   0.25 = 1/4, 0.5590... = sqrt(5)/4. */
Chris@82 165 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 166 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 167 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 168 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 169 {
Chris@82 170 INT m;
Chris@82 171 R *x;
Chris@82 172 x = ri;
/* One pass per VL-sized step of m; x and W are advanced in the loop header. */
Chris@82 173 for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@82 174 V Tr, TH, Tg, Tl, Tm, TA, TB, TJ, T5, Ta, Tb, TD, TE, TI, To;
Chris@82 175 V Tq, Tp;
/* Element 0 and twiddled element 5: Tr is their difference, TH their sum. */
Chris@82 176 To = LD(&(x[0]), ms, &(x[0]));
Chris@82 177 Tp = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 178 Tq = BYTWJ(&(W[TWVL * 8]), Tp);
Chris@82 179 Tr = VSUB(To, Tq);
Chris@82 180 TH = VADD(To, Tq);
/* Twiddled elements 4, 1, 9, 6: differences Tg, Tl (and Tm = Tg + Tl),
   sums TA, TB (and TJ = TA + TB). */
Chris@82 181 {
Chris@82 182 V Td, Tk, Tf, Ti;
Chris@82 183 {
Chris@82 184 V Tc, Tj, Te, Th;
Chris@82 185 Tc = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 186 Td = BYTWJ(&(W[TWVL * 6]), Tc);
Chris@82 187 Tj = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 188 Tk = BYTWJ(&(W[0]), Tj);
Chris@82 189 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 190 Tf = BYTWJ(&(W[TWVL * 16]), Te);
Chris@82 191 Th = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 192 Ti = BYTWJ(&(W[TWVL * 10]), Th);
Chris@82 193 }
Chris@82 194 Tg = VSUB(Td, Tf);
Chris@82 195 Tl = VSUB(Ti, Tk);
Chris@82 196 Tm = VADD(Tg, Tl);
Chris@82 197 TA = VADD(Td, Tf);
Chris@82 198 TB = VADD(Ti, Tk);
Chris@82 199 TJ = VADD(TA, TB);
Chris@82 200 }
/* Twiddled elements 2, 3, 7, 8: differences T5, Ta (and Tb = T5 + Ta),
   sums TD, TE (and TI = TD + TE). */
Chris@82 201 {
Chris@82 202 V T2, T9, T4, T7;
Chris@82 203 {
Chris@82 204 V T1, T8, T3, T6;
Chris@82 205 T1 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 206 T2 = BYTWJ(&(W[TWVL * 2]), T1);
Chris@82 207 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 208 T9 = BYTWJ(&(W[TWVL * 4]), T8);
Chris@82 209 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 210 T4 = BYTWJ(&(W[TWVL * 12]), T3);
Chris@82 211 T6 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 212 T7 = BYTWJ(&(W[TWVL * 14]), T6);
Chris@82 213 }
Chris@82 214 T5 = VSUB(T2, T4);
Chris@82 215 Ta = VSUB(T7, T9);
Chris@82 216 Tb = VADD(T5, Ta);
Chris@82 217 TD = VADD(T2, T4);
Chris@82 218 TE = VADD(T7, T9);
Chris@82 219 TI = VADD(TD, TE);
Chris@82 220 }
/* Outputs 5, 3, 7, 1, 9: built from Tr and the difference terms. */
Chris@82 221 {
Chris@82 222 V Tn, Ts, Tt, Tx, Tz, Tv, Tw, Ty, Tu;
Chris@82 223 Tn = VMUL(LDK(KP559016994), VSUB(Tb, Tm));
Chris@82 224 Ts = VADD(Tb, Tm);
Chris@82 225 Tt = VFNMS(LDK(KP250000000), Ts, Tr);
Chris@82 226 Tv = VSUB(T5, Ta);
Chris@82 227 Tw = VSUB(Tg, Tl);
Chris@82 228 Tx = VBYI(VFMA(LDK(KP951056516), Tv, VMUL(LDK(KP587785252), Tw)));
Chris@82 229 Tz = VBYI(VFNMS(LDK(KP587785252), Tv, VMUL(LDK(KP951056516), Tw)));
Chris@82 230 ST(&(x[WS(rs, 5)]), VADD(Tr, Ts), ms, &(x[WS(rs, 1)]));
Chris@82 231 Ty = VSUB(Tt, Tn);
Chris@82 232 ST(&(x[WS(rs, 3)]), VSUB(Ty, Tz), ms, &(x[WS(rs, 1)]));
Chris@82 233 ST(&(x[WS(rs, 7)]), VADD(Tz, Ty), ms, &(x[WS(rs, 1)]));
Chris@82 234 Tu = VADD(Tn, Tt);
Chris@82 235 ST(&(x[WS(rs, 1)]), VSUB(Tu, Tx), ms, &(x[WS(rs, 1)]));
Chris@82 236 ST(&(x[WS(rs, 9)]), VADD(Tx, Tu), ms, &(x[WS(rs, 1)]));
Chris@82 237 }
/* Outputs 0, 4, 6, 2, 8: built from TH and the sum terms. */
Chris@82 238 {
Chris@82 239 V TM, TK, TL, TG, TO, TC, TF, TP, TN;
Chris@82 240 TM = VMUL(LDK(KP559016994), VSUB(TI, TJ));
Chris@82 241 TK = VADD(TI, TJ);
Chris@82 242 TL = VFNMS(LDK(KP250000000), TK, TH);
Chris@82 243 TC = VSUB(TA, TB);
Chris@82 244 TF = VSUB(TD, TE);
Chris@82 245 TG = VBYI(VFNMS(LDK(KP587785252), TF, VMUL(LDK(KP951056516), TC)));
Chris@82 246 TO = VBYI(VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TC)));
Chris@82 247 ST(&(x[0]), VADD(TH, TK), ms, &(x[0]));
Chris@82 248 TP = VADD(TM, TL);
Chris@82 249 ST(&(x[WS(rs, 4)]), VADD(TO, TP), ms, &(x[0]));
Chris@82 250 ST(&(x[WS(rs, 6)]), VSUB(TP, TO), ms, &(x[0]));
Chris@82 251 TN = VSUB(TL, TM);
Chris@82 252 ST(&(x[WS(rs, 2)]), VADD(TG, TN), ms, &(x[0]));
Chris@82 253 ST(&(x[WS(rs, 8)]), VSUB(TN, TG), ms, &(x[0]));
Chris@82 254 }
Chris@82 255 }
Chris@82 256 }
Chris@82 257 VLEAVE();
Chris@82 258 }
Chris@82 259
/*
 * Twiddle instruction table for the non-FMA variant of t1fv_10: nine VTW
 * entries with second argument 1..9, closed by a TW_NEXT record carrying VL.
 * The nine entries line up with the nine BYTWJ calls in t1fv_10.
 * NOTE(review): VTW and TW_NEXT are defined in headers not visible here;
 * presumably this tells the planner how to lay out W -- confirm.
 */
Chris@82 260 static const tw_instr twinstr[] = {
Chris@82 261 VTW(0, 1),
Chris@82 262 VTW(0, 2),
Chris@82 263 VTW(0, 3),
Chris@82 264 VTW(0, 4),
Chris@82 265 VTW(0, 5),
Chris@82 266 VTW(0, 6),
Chris@82 267 VTW(0, 7),
Chris@82 268 VTW(0, 8),
Chris@82 269 VTW(0, 9),
Chris@82 270 {TW_NEXT, VL, 0}
Chris@82 271 };
Chris@82 272
/*
 * Descriptor for the non-FMA variant: the value 10, the codelet name string,
 * the twinstr table, &GENUS, and the tuple {45, 24, 6, 0}, whose first three
 * numbers equal the "45 additions, 24 multiplications, 6 fused multiply/add"
 * tally in this branch's generator comment.
 * NOTE(review): the meaning of each ct_desc field, including the three
 * trailing zeros, is defined outside this file.
 */
Chris@82 273 static const ct_desc desc = { 10, XSIMD_STRING("t1fv_10"), twinstr, &GENUS, {45, 24, 6, 0}, 0, 0, 0 };
Chris@82 274
/*
 * Registration entry point (non-FMA variant): hands t1fv_10 and its
 * descriptor to the planner p through kdft_dit_register.
 */
Chris@82 275 void XSIMD(codelet_t1fv_10) (planner *p) {
Chris@82 276 X(kdft_dit_register) (p, t1fv_10, &desc);
Chris@82 277 }
Chris@82 278 #endif