annotate src/fftw-3.3.8/dft/simd/common/t3bv_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:09 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 10 -name t3bv_10 -include dft/simd/t3b.h -sign 1 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 57 FP additions, 52 FP multiplications,
Chris@82 32 * (or, 39 additions, 34 multiplications, 18 fused multiply/add),
Chris@82 33 * 41 stack variables, 4 constants, and 20 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t3b.h"
Chris@82 36
/*
 * t3bv_10 (FMA variant): size-10 SIMD twiddle codelet, emitted by genfft with
 * the options recorded in the "Generated by" comment preceding this function
 * (-n 10, -sign 1, -twiddle-log3). Generated code: regenerate, do not hand-edit.
 *
 * Only `ii` is used as the data pointer (x = ii); `ri` is never read here.
 * The loop runs m from mb up to (not including) me in steps of VL; on every
 * step x advances by VL * ms and W by TWVL * 6.
 * Inside the loop the ten inputs x[WS(rs, 0..9)] are loaded, all but x[0] are
 * multiplied by a twiddle vector, and ten results are stored back to the same
 * ten locations.
 */
Chris@82 37 static void t3bv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numerically: 0.5590... = sqrt(5)/4, 0.6180... = (sqrt(5)-1)/2,
 * 0.9510... = sin(2*pi/5), 0.25 = 1/4. */
Chris@82 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 40 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 41 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 43 {
Chris@82 44 INT m;
Chris@82 45 R *x;
Chris@82 46 x = ii;
Chris@82 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@82 48 V T2, T3, T4, Ta, T5, T6, Tt, Td, Th;
/* Only three twiddle vectors (T2, T3, T5) are loaded from W; the other six
 * multipliers (T4, Ta, T6, Tt, Td, Th) are derived from them with
 * VZMUL / VZMULJ. Presumably this is the effect of -twiddle-log3 -- confirm
 * against the genfft documentation. */
Chris@82 49 T2 = LDW(&(W[0]));
Chris@82 50 T3 = LDW(&(W[TWVL * 2]));
Chris@82 51 T4 = VZMUL(T2, T3);
Chris@82 52 Ta = VZMULJ(T2, T3);
Chris@82 53 T5 = LDW(&(W[TWVL * 4]));
Chris@82 54 T6 = VZMULJ(T4, T5);
Chris@82 55 Tt = VZMULJ(T3, T5);
Chris@82 56 Td = VZMULJ(Ta, T5);
Chris@82 57 Th = VZMULJ(T2, T5);
Chris@82 58 {
Chris@82 59 V T9, TJ, Ts, Ty, Tz, TN, TO, TP, Tg, Tm, Tn, TK, TL, TM, T1;
Chris@82 60 V T8, T7;
/* x[0] is used as loaded; x[5] is multiplied by T6.
 * T9 / TJ are their difference / sum. */
Chris@82 61 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 62 T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 63 T8 = VZMUL(T6, T7);
Chris@82 64 T9 = VSUB(T1, T8);
Chris@82 65 TJ = VADD(T1, T8);
Chris@82 66 {
Chris@82 67 V Tp, Tx, Tr, Tv;
Chris@82 68 {
Chris@82 69 V To, Tw, Tq, Tu;
/* Inputs 4, 1, 9 and 6, each multiplied by its twiddle (T4, T2, T5, Tt). */
Chris@82 70 To = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 71 Tp = VZMUL(T4, To);
Chris@82 72 Tw = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 73 Tx = VZMUL(T2, Tw);
Chris@82 74 Tq = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 75 Tr = VZMUL(T5, Tq);
Chris@82 76 Tu = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 77 Tv = VZMUL(Tt, Tu);
Chris@82 78 }
/* Pairwise differences (Ts, Ty, and their sum Tz) feed the odd-indexed
 * outputs; pairwise sums (TN, TO, and their sum TP) feed the even ones. */
Chris@82 79 Ts = VSUB(Tp, Tr);
Chris@82 80 Ty = VSUB(Tv, Tx);
Chris@82 81 Tz = VADD(Ts, Ty);
Chris@82 82 TN = VADD(Tp, Tr);
Chris@82 83 TO = VADD(Tv, Tx);
Chris@82 84 TP = VADD(TN, TO);
Chris@82 85 }
Chris@82 86 {
Chris@82 87 V Tc, Tl, Tf, Tj;
Chris@82 88 {
Chris@82 89 V Tb, Tk, Te, Ti;
/* Inputs 2, 3, 7 and 8, each multiplied by its twiddle (Ta, T3, Td, Th). */
Chris@82 90 Tb = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 91 Tc = VZMUL(Ta, Tb);
Chris@82 92 Tk = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 93 Tl = VZMUL(T3, Tk);
Chris@82 94 Te = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 95 Tf = VZMUL(Td, Te);
Chris@82 96 Ti = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 97 Tj = VZMUL(Th, Ti);
Chris@82 98 }
/* Same difference / sum split as above: Tg, Tm, Tn and TK, TL, TM. */
Chris@82 99 Tg = VSUB(Tc, Tf);
Chris@82 100 Tm = VSUB(Tj, Tl);
Chris@82 101 Tn = VADD(Tg, Tm);
Chris@82 102 TK = VADD(Tc, Tf);
Chris@82 103 TL = VADD(Tj, Tl);
Chris@82 104 TM = VADD(TK, TL);
Chris@82 105 }
Chris@82 106 {
Chris@82 107 V TC, TA, TB, TG, TI, TE, TF, TH, TD;
/* Outputs 5, 3, 7, 1 and 9, built from the difference terms
 * (T9, Tg, Tm, Ts, Ty). */
Chris@82 108 TC = VSUB(Tn, Tz);
Chris@82 109 TA = VADD(Tn, Tz);
Chris@82 110 TB = VFNMS(LDK(KP250000000), TA, T9);
Chris@82 111 TE = VSUB(Tg, Tm);
Chris@82 112 TF = VSUB(Ts, Ty);
Chris@82 113 TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TF, TE));
Chris@82 114 TI = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TE, TF));
Chris@82 115 ST(&(x[WS(rs, 5)]), VADD(T9, TA), ms, &(x[WS(rs, 1)]));
Chris@82 116 TH = VFNMS(LDK(KP559016994), TC, TB);
Chris@82 117 ST(&(x[WS(rs, 3)]), VFMAI(TI, TH), ms, &(x[WS(rs, 1)]));
Chris@82 118 ST(&(x[WS(rs, 7)]), VFNMSI(TI, TH), ms, &(x[WS(rs, 1)]));
Chris@82 119 TD = VFMA(LDK(KP559016994), TC, TB);
Chris@82 120 ST(&(x[WS(rs, 1)]), VFMAI(TG, TD), ms, &(x[WS(rs, 1)]));
Chris@82 121 ST(&(x[WS(rs, 9)]), VFNMSI(TG, TD), ms, &(x[WS(rs, 1)]));
Chris@82 122 }
Chris@82 123 {
Chris@82 124 V TS, TQ, TR, TW, TY, TU, TV, TX, TT;
/* Outputs 0, 4, 6, 2 and 8, built from the sum terms
 * (TJ, TK, TL, TN, TO). */
Chris@82 125 TS = VSUB(TM, TP);
Chris@82 126 TQ = VADD(TM, TP);
Chris@82 127 TR = VFNMS(LDK(KP250000000), TQ, TJ);
Chris@82 128 TU = VSUB(TN, TO);
Chris@82 129 TV = VSUB(TK, TL);
Chris@82 130 TW = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TV, TU));
Chris@82 131 TY = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TU, TV));
Chris@82 132 ST(&(x[0]), VADD(TJ, TQ), ms, &(x[0]));
Chris@82 133 TX = VFMA(LDK(KP559016994), TS, TR);
Chris@82 134 ST(&(x[WS(rs, 4)]), VFNMSI(TY, TX), ms, &(x[0]));
Chris@82 135 ST(&(x[WS(rs, 6)]), VFMAI(TY, TX), ms, &(x[0]));
Chris@82 136 TT = VFNMS(LDK(KP559016994), TS, TR);
Chris@82 137 ST(&(x[WS(rs, 2)]), VFNMSI(TW, TT), ms, &(x[0]));
Chris@82 138 ST(&(x[WS(rs, 8)]), VFMAI(TW, TT), ms, &(x[0]));
Chris@82 139 }
Chris@82 140 }
Chris@82 141 }
Chris@82 142 }
Chris@82 143 VLEAVE();
Chris@82 144 }
Chris@82 145
/*
 * Twiddle instruction list for the FMA build of t3bv_10: three VTW entries
 * (second argument 1, 3 and 9) followed by a {TW_NEXT, VL, 0} entry.
 * The three entries line up with the three LDW loads in t3bv_10; presumably
 * they select powers 1, 3 and 9 of the twiddle root and TW_NEXT marks the end
 * of the list -- confirm against the VTW / tw_instr definitions.
 */
Chris@82 146 static const tw_instr twinstr[] = {
Chris@82 147 VTW(0, 1),
Chris@82 148 VTW(0, 3),
Chris@82 149 VTW(0, 9),
Chris@82 150 {TW_NEXT, VL, 0}
Chris@82 151 };
Chris@82 152
/*
 * Codelet descriptor for the FMA build: size 10 (matches -n 10), the SIMD
 * name string for t3bv_10, the twinstr list, &GENUS, and the counts
 * {39, 34, 18, 0}, which match the "39 additions, 34 multiplications,
 * 18 fused multiply/add" figures in the generator summary comment.
 * NOTE(review): the meaning of the three trailing zero fields is not visible
 * in this file; see the ct_desc declaration.
 */
Chris@82 153 static const ct_desc desc = { 10, XSIMD_STRING("t3bv_10"), twinstr, &GENUS, {39, 34, 18, 0}, 0, 0, 0 };
Chris@82 154
/*
 * Registration entry point (FMA build): passes the t3bv_10 kernel and its
 * descriptor to X(kdft_dit_register) for planner p.
 */
Chris@82 155 void XSIMD(codelet_t3bv_10) (planner *p) {
Chris@82 156 X(kdft_dit_register) (p, t3bv_10, &desc);
Chris@82 157 }
Chris@82 158 #else
Chris@82 159
Chris@82 160 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 10 -name t3bv_10 -include dft/simd/t3b.h -sign 1 */
Chris@82 161
Chris@82 162 /*
Chris@82 163 * This function contains 57 FP additions, 42 FP multiplications,
Chris@82 164 * (or, 51 additions, 36 multiplications, 6 fused multiply/add),
Chris@82 165 * 41 stack variables, 4 constants, and 20 memory accesses
Chris@82 166 */
Chris@82 167 #include "dft/simd/t3b.h"
Chris@82 168
/*
 * t3bv_10 (non-FMA variant): size-10 SIMD twiddle codelet, emitted by genfft
 * with the options recorded in the "Generated by" comment preceding this
 * function (same as the FMA variant but without -fma). Generated code:
 * regenerate, do not hand-edit.
 *
 * Only `ii` is used as the data pointer (x = ii); `ri` is never read here.
 * The loop runs m from mb up to (not including) me in steps of VL; on every
 * step x advances by VL * ms and W by TWVL * 6.
 * Inside the loop the ten inputs x[WS(rs, 0..9)] are loaded, all but x[0] are
 * multiplied by a twiddle vector, and ten results are stored back to the same
 * ten locations.
 */
Chris@82 169 static void t3bv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 170 {
/* Numerically: 0.5877... = sin(pi/5), 0.9510... = sin(2*pi/5),
 * 0.25 = 1/4, 0.5590... = sqrt(5)/4. */
Chris@82 171 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 172 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 173 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 174 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 175 {
Chris@82 176 INT m;
Chris@82 177 R *x;
Chris@82 178 x = ii;
Chris@82 179 for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@82 180 V T1, T2, T3, Ti, T6, T7, TA, Tb, To;
/* Only three twiddle vectors (T1, T2, T6) are loaded from W; the other six
 * multipliers (T3, Ti, T7, TA, Tb, To) are derived from them with
 * VZMUL / VZMULJ. Presumably this is the effect of -twiddle-log3 -- confirm
 * against the genfft documentation. */
Chris@82 181 T1 = LDW(&(W[0]));
Chris@82 182 T2 = LDW(&(W[TWVL * 2]));
Chris@82 183 T3 = VZMULJ(T1, T2);
Chris@82 184 Ti = VZMUL(T1, T2);
Chris@82 185 T6 = LDW(&(W[TWVL * 4]));
Chris@82 186 T7 = VZMULJ(T3, T6);
Chris@82 187 TA = VZMULJ(Ti, T6);
Chris@82 188 Tb = VZMULJ(T1, T6);
Chris@82 189 To = VZMULJ(T2, T6);
Chris@82 190 {
Chris@82 191 V TD, TQ, Tn, Tt, Tx, TM, TN, TS, Ta, Tg, Tw, TJ, TK, TR, Tz;
Chris@82 192 V TC, TB;
/* x[0] is used as loaded; x[5] is multiplied by TA.
 * TD / TQ are their difference / sum. */
Chris@82 193 Tz = LD(&(x[0]), ms, &(x[0]));
Chris@82 194 TB = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 195 TC = VZMUL(TA, TB);
Chris@82 196 TD = VSUB(Tz, TC);
Chris@82 197 TQ = VADD(Tz, TC);
Chris@82 198 {
Chris@82 199 V Tk, Ts, Tm, Tq;
Chris@82 200 {
Chris@82 201 V Tj, Tr, Tl, Tp;
/* Inputs 4, 1, 9 and 6, each multiplied by its twiddle (Ti, T1, T6, To). */
Chris@82 202 Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 203 Tk = VZMUL(Ti, Tj);
Chris@82 204 Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 205 Ts = VZMUL(T1, Tr);
Chris@82 206 Tl = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 207 Tm = VZMUL(T6, Tl);
Chris@82 208 Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 209 Tq = VZMUL(To, Tp);
Chris@82 210 }
/* Pairwise differences (Tn, Tt, and their sum Tx) feed the odd-indexed
 * outputs; pairwise sums (TM, TN, and their sum TS) feed the even ones. */
Chris@82 211 Tn = VSUB(Tk, Tm);
Chris@82 212 Tt = VSUB(Tq, Ts);
Chris@82 213 Tx = VADD(Tn, Tt);
Chris@82 214 TM = VADD(Tk, Tm);
Chris@82 215 TN = VADD(Tq, Ts);
Chris@82 216 TS = VADD(TM, TN);
Chris@82 217 }
Chris@82 218 {
Chris@82 219 V T5, Tf, T9, Td;
Chris@82 220 {
Chris@82 221 V T4, Te, T8, Tc;
/* Inputs 2, 3, 7 and 8, each multiplied by its twiddle (T3, T2, T7, Tb). */
Chris@82 222 T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 223 T5 = VZMUL(T3, T4);
Chris@82 224 Te = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 225 Tf = VZMUL(T2, Te);
Chris@82 226 T8 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 227 T9 = VZMUL(T7, T8);
Chris@82 228 Tc = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 229 Td = VZMUL(Tb, Tc);
Chris@82 230 }
/* Same difference / sum split as above: Ta, Tg, Tw and TJ, TK, TR. */
Chris@82 231 Ta = VSUB(T5, T9);
Chris@82 232 Tg = VSUB(Td, Tf);
Chris@82 233 Tw = VADD(Ta, Tg);
Chris@82 234 TJ = VADD(T5, T9);
Chris@82 235 TK = VADD(Td, Tf);
Chris@82 236 TR = VADD(TJ, TK);
Chris@82 237 }
Chris@82 238 {
Chris@82 239 V Ty, TE, TF, Tv, TI, Th, Tu, TH, TG;
/* Outputs 5, 3, 7, 1 and 9, built from the difference terms
 * (TD, Ta, Tg, Tn, Tt). Unlike the FMA variant, the VBYI results are
 * combined with plain VADD / VSUB. */
Chris@82 240 Ty = VMUL(LDK(KP559016994), VSUB(Tw, Tx));
Chris@82 241 TE = VADD(Tw, Tx);
Chris@82 242 TF = VFNMS(LDK(KP250000000), TE, TD);
Chris@82 243 Th = VSUB(Ta, Tg);
Chris@82 244 Tu = VSUB(Tn, Tt);
Chris@82 245 Tv = VBYI(VFMA(LDK(KP951056516), Th, VMUL(LDK(KP587785252), Tu)));
Chris@82 246 TI = VBYI(VFNMS(LDK(KP951056516), Tu, VMUL(LDK(KP587785252), Th)));
Chris@82 247 ST(&(x[WS(rs, 5)]), VADD(TD, TE), ms, &(x[WS(rs, 1)]));
Chris@82 248 TH = VSUB(TF, Ty);
Chris@82 249 ST(&(x[WS(rs, 3)]), VSUB(TH, TI), ms, &(x[WS(rs, 1)]));
Chris@82 250 ST(&(x[WS(rs, 7)]), VADD(TI, TH), ms, &(x[WS(rs, 1)]));
Chris@82 251 TG = VADD(Ty, TF);
Chris@82 252 ST(&(x[WS(rs, 1)]), VADD(Tv, TG), ms, &(x[WS(rs, 1)]));
Chris@82 253 ST(&(x[WS(rs, 9)]), VSUB(TG, Tv), ms, &(x[WS(rs, 1)]));
Chris@82 254 }
Chris@82 255 {
Chris@82 256 V TV, TT, TU, TP, TY, TL, TO, TX, TW;
/* Outputs 0, 4, 6, 2 and 8, built from the sum terms
 * (TQ, TJ, TK, TM, TN). */
Chris@82 257 TV = VMUL(LDK(KP559016994), VSUB(TR, TS));
Chris@82 258 TT = VADD(TR, TS);
Chris@82 259 TU = VFNMS(LDK(KP250000000), TT, TQ);
Chris@82 260 TL = VSUB(TJ, TK);
Chris@82 261 TO = VSUB(TM, TN);
Chris@82 262 TP = VBYI(VFNMS(LDK(KP951056516), TO, VMUL(LDK(KP587785252), TL)));
Chris@82 263 TY = VBYI(VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TO)));
Chris@82 264 ST(&(x[0]), VADD(TQ, TT), ms, &(x[0]));
Chris@82 265 TX = VADD(TV, TU);
Chris@82 266 ST(&(x[WS(rs, 4)]), VSUB(TX, TY), ms, &(x[0]));
Chris@82 267 ST(&(x[WS(rs, 6)]), VADD(TY, TX), ms, &(x[0]));
Chris@82 268 TW = VSUB(TU, TV);
Chris@82 269 ST(&(x[WS(rs, 2)]), VADD(TP, TW), ms, &(x[0]));
Chris@82 270 ST(&(x[WS(rs, 8)]), VSUB(TW, TP), ms, &(x[0]));
Chris@82 271 }
Chris@82 272 }
Chris@82 273 }
Chris@82 274 }
Chris@82 275 VLEAVE();
Chris@82 276 }
Chris@82 277
/*
 * Twiddle instruction list for the non-FMA build of t3bv_10: three VTW
 * entries (second argument 1, 3 and 9) followed by a {TW_NEXT, VL, 0} entry.
 * The three entries line up with the three LDW loads in t3bv_10; presumably
 * they select powers 1, 3 and 9 of the twiddle root and TW_NEXT marks the end
 * of the list -- confirm against the VTW / tw_instr definitions.
 */
Chris@82 278 static const tw_instr twinstr[] = {
Chris@82 279 VTW(0, 1),
Chris@82 280 VTW(0, 3),
Chris@82 281 VTW(0, 9),
Chris@82 282 {TW_NEXT, VL, 0}
Chris@82 283 };
Chris@82 284
/*
 * Codelet descriptor for the non-FMA build: size 10 (matches -n 10), the SIMD
 * name string for t3bv_10, the twinstr list, &GENUS, and the counts
 * {51, 36, 6, 0}, which match the "51 additions, 36 multiplications,
 * 6 fused multiply/add" figures in the generator summary comment.
 * NOTE(review): the meaning of the three trailing zero fields is not visible
 * in this file; see the ct_desc declaration.
 */
Chris@82 285 static const ct_desc desc = { 10, XSIMD_STRING("t3bv_10"), twinstr, &GENUS, {51, 36, 6, 0}, 0, 0, 0 };
Chris@82 286
/*
 * Registration entry point (non-FMA build): passes the t3bv_10 kernel and its
 * descriptor to X(kdft_dit_register) for planner p.
 */
Chris@82 287 void XSIMD(codelet_t3bv_10) (planner *p) {
Chris@82 288 X(kdft_dit_register) (p, t3bv_10, &desc);
Chris@82 289 }
Chris@82 290 #endif