annotate src/fftw-3.3.5/dft/simd/common/t3bv_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:45:01 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 10 -name t3bv_10 -include t3b.h -sign 1 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 57 FP additions, 52 FP multiplications,
Chris@42 32 * (or, 39 additions, 34 multiplications, 18 fused multiply/add),
Chris@42 33 * 57 stack variables, 4 constants, and 20 memory accesses
Chris@42 34 */
Chris@42 35 #include "t3b.h"
Chris@42 36
/*
 * t3bv_10 -- variant compiled when HAVE_FMA is defined.
 *
 * Machine-generated size-10 twiddle codelet (generator options include
 * -n 10, -sign 1, -twiddle-log3).  On every loop iteration it:
 *   1. loads three twiddle vectors from W,
 *   2. derives six further multipliers from them with VZMUL/VZMULJ,
 *   3. multiplies the inputs x[WS(rs, 1)] .. x[WS(rs, 9)] by those nine
 *      multipliers (x[0] is used as loaded),
 *   4. combines the ten values and stores ten results back to the same
 *      ten locations x[WS(rs, 0)] .. x[WS(rs, 9)].
 *
 * Parameters:
 *   ri     - not referenced by the body.
 *   ii     - base pointer used for every load and store (x = ii).
 *   W      - twiddle table; first advanced by mb * ((TWVL / VL) * 6),
 *            then by TWVL * 6 after each iteration.
 *   rs     - stride handed to WS() to address the ten elements.
 *   mb, me - the loop runs m = mb while m < me, in steps of VL.
 *   ms     - x is advanced by VL * ms after each iteration; also passed
 *            to LD/ST.
 *
 * NOTE(review): the exact semantics of LD/ST/LDW/VZMUL/VZMULJ/VFMA/VFNMS/
 * VFMAI/VFNMSI come from the SIMD headers, which are not visible here.
 */
Chris@42 37 static void t3bv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Constants, approximately: sqrt(5)/4, 1/4, (sqrt(5)-1)/2, sin(2*pi/5). */
Chris@42 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 40 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 42 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 43 {
Chris@42 44 INT m;
Chris@42 45 R *x;
/* All data access goes through ii; ri is never read or written. */
Chris@42 46 x = ii;
/* The loop header also pre-offsets W for the starting index mb. */
Chris@42 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@42 48 V T1, T7, Th, Tx, Tr, Td, Tp, T6, Tv, Tc, Te, Ti, Tl, T2, T3;
Chris@42 49 V T5;
/* The three twiddle vectors stored in W for this iteration. */
Chris@42 50 T2 = LDW(&(W[0]));
Chris@42 51 T3 = LDW(&(W[TWVL * 2]));
Chris@42 52 T5 = LDW(&(W[TWVL * 4]));
Chris@42 53 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 54 T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 55 {
/* Load inputs, derive the remaining multipliers (Ta, T4, Th, Tt, Td, T6)
 * from T2/T3/T5, and apply multipliers to the loaded inputs. */
Chris@42 56 V To, Tw, Tq, Tu, Ta, T4, Tt, Tk, Tb;
Chris@42 57 To = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 58 Tw = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 59 Tq = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 60 Tu = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 61 Ta = VZMULJ(T2, T3);
Chris@42 62 T4 = VZMUL(T2, T3);
Chris@42 63 Th = VZMULJ(T2, T5);
Chris@42 64 Tt = VZMULJ(T3, T5);
Chris@42 65 Tb = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 66 Tx = VZMUL(T2, Tw);
Chris@42 67 Tr = VZMUL(T5, Tq);
Chris@42 68 Tk = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 69 Td = VZMULJ(Ta, T5);
Chris@42 70 Tp = VZMUL(T4, To);
Chris@42 71 T6 = VZMULJ(T4, T5);
Chris@42 72 Tv = VZMUL(Tt, Tu);
Chris@42 73 Tc = VZMUL(Ta, Tb);
Chris@42 74 Te = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 75 Ti = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 76 Tl = VZMUL(T3, Tk);
Chris@42 77 }
Chris@42 78 {
/* Pairwise sums and differences of the multiplied inputs. */
Chris@42 79 V TN, Ts, T8, Ty, TO, Tf, Tj;
Chris@42 80 TN = VADD(Tp, Tr);
Chris@42 81 Ts = VSUB(Tp, Tr);
Chris@42 82 T8 = VZMUL(T6, T7);
Chris@42 83 Ty = VSUB(Tv, Tx);
Chris@42 84 TO = VADD(Tv, Tx);
Chris@42 85 Tf = VZMUL(Td, Te);
Chris@42 86 Tj = VZMUL(Th, Ti);
Chris@42 87 {
Chris@42 88 V T9, TJ, TP, TU, Tz, TF, Tg, TK, Tm, TL;
Chris@42 89 T9 = VSUB(T1, T8);
Chris@42 90 TJ = VADD(T1, T8);
Chris@42 91 TP = VADD(TN, TO);
Chris@42 92 TU = VSUB(TN, TO);
Chris@42 93 Tz = VADD(Ts, Ty);
Chris@42 94 TF = VSUB(Ts, Ty);
Chris@42 95 Tg = VSUB(Tc, Tf);
Chris@42 96 TK = VADD(Tc, Tf);
Chris@42 97 Tm = VSUB(Tj, Tl);
Chris@42 98 TL = VADD(Tj, Tl);
Chris@42 99 {
Chris@42 100 V TM, TV, Tn, TE;
Chris@42 101 TM = VADD(TK, TL);
Chris@42 102 TV = VSUB(TK, TL);
Chris@42 103 Tn = VADD(Tg, Tm);
Chris@42 104 TE = VSUB(Tg, Tm);
Chris@42 105 {
/* Sum-derived terms (TJ, TQ, TS, TW, TY) feed the even-indexed outputs;
 * difference-derived terms (T9, TA, TC, TG, TI) feed the odd-indexed ones. */
Chris@42 106 V TW, TY, TS, TQ, TG, TI, TC, TA, TR, TB;
Chris@42 107 TW = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TV, TU));
Chris@42 108 TY = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TU, TV));
Chris@42 109 TS = VSUB(TM, TP);
Chris@42 110 TQ = VADD(TM, TP);
Chris@42 111 TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TF, TE));
Chris@42 112 TI = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TE, TF));
Chris@42 113 TC = VSUB(Tn, Tz);
Chris@42 114 TA = VADD(Tn, Tz);
Chris@42 115 ST(&(x[0]), VADD(TJ, TQ), ms, &(x[0]));
Chris@42 116 TR = VFNMS(LDK(KP250000000), TQ, TJ);
Chris@42 117 ST(&(x[WS(rs, 5)]), VADD(T9, TA), ms, &(x[WS(rs, 1)]));
Chris@42 118 TB = VFNMS(LDK(KP250000000), TA, T9);
Chris@42 119 {
/* Remaining eight outputs, stored in place over the inputs. */
Chris@42 120 V TX, TT, TH, TD;
Chris@42 121 TX = VFMA(LDK(KP559016994), TS, TR);
Chris@42 122 TT = VFNMS(LDK(KP559016994), TS, TR);
Chris@42 123 TH = VFNMS(LDK(KP559016994), TC, TB);
Chris@42 124 TD = VFMA(LDK(KP559016994), TC, TB);
Chris@42 125 ST(&(x[WS(rs, 8)]), VFMAI(TW, TT), ms, &(x[0]));
Chris@42 126 ST(&(x[WS(rs, 2)]), VFNMSI(TW, TT), ms, &(x[0]));
Chris@42 127 ST(&(x[WS(rs, 6)]), VFMAI(TY, TX), ms, &(x[0]));
Chris@42 128 ST(&(x[WS(rs, 4)]), VFNMSI(TY, TX), ms, &(x[0]));
Chris@42 129 ST(&(x[WS(rs, 9)]), VFNMSI(TG, TD), ms, &(x[WS(rs, 1)]));
Chris@42 130 ST(&(x[WS(rs, 1)]), VFMAI(TG, TD), ms, &(x[WS(rs, 1)]));
Chris@42 131 ST(&(x[WS(rs, 7)]), VFNMSI(TI, TH), ms, &(x[WS(rs, 1)]));
Chris@42 132 ST(&(x[WS(rs, 3)]), VFMAI(TI, TH), ms, &(x[WS(rs, 1)]));
Chris@42 133 }
Chris@42 134 }
Chris@42 135 }
Chris@42 136 }
Chris@42 137 }
Chris@42 138 }
Chris@42 139 }
Chris@42 140 VLEAVE();
Chris@42 141 }
Chris@42 142
/*
 * Twiddle instruction table referenced by desc: three VTW entries followed
 * by a TW_NEXT entry.
 * NOTE(review): the three VTW entries presumably correspond to the three
 * LDW loads in t3bv_10 -- confirm against the VTW macro definition.
 */
Chris@42 143 static const tw_instr twinstr[] = {
Chris@42 144 VTW(0, 1),
Chris@42 145 VTW(0, 3),
Chris@42 146 VTW(0, 9),
Chris@42 147 {TW_NEXT, VL, 0}
Chris@42 148 };
Chris@42 149
/*
 * Codelet descriptor passed to the registration call: size 10, the codelet
 * name string, the twiddle instruction table, &GENUS, and the counts
 * {39, 34, 18, 0}, which match the generator's header comment for this
 * variant (39 additions, 34 multiplications, 18 fused multiply/add).
 */
Chris@42 150 static const ct_desc desc = { 10, XSIMD_STRING("t3bv_10"), twinstr, &GENUS, {39, 34, 18, 0}, 0, 0, 0 };
Chris@42 151
/*
 * Registration entry point: hands the t3bv_10 function and its descriptor
 * to the planner p via X(kdft_dit_register).
 */
Chris@42 152 void XSIMD(codelet_t3bv_10) (planner *p) {
Chris@42 153 X(kdft_dit_register) (p, t3bv_10, &desc);
Chris@42 154 }
Chris@42 155 #else /* HAVE_FMA */
Chris@42 156
Chris@42 157 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 10 -name t3bv_10 -include t3b.h -sign 1 */
Chris@42 158
Chris@42 159 /*
Chris@42 160 * This function contains 57 FP additions, 42 FP multiplications,
Chris@42 161 * (or, 51 additions, 36 multiplications, 6 fused multiply/add),
Chris@42 162 * 41 stack variables, 4 constants, and 20 memory accesses
Chris@42 163 */
Chris@42 164 #include "t3b.h"
Chris@42 165
/*
 * t3bv_10 -- variant compiled when HAVE_FMA is not defined.
 *
 * Machine-generated size-10 twiddle codelet (generator options include
 * -n 10, -sign 1, -twiddle-log3).  On every loop iteration it:
 *   1. loads three twiddle vectors from W,
 *   2. derives six further multipliers from them with VZMUL/VZMULJ,
 *   3. multiplies the inputs x[WS(rs, 1)] .. x[WS(rs, 9)] by those nine
 *      multipliers (x[0] is used as loaded),
 *   4. combines the ten values and stores ten results back to the same
 *      ten locations x[WS(rs, 0)] .. x[WS(rs, 9)].
 *
 * Parameters:
 *   ri     - not referenced by the body.
 *   ii     - base pointer used for every load and store (x = ii).
 *   W      - twiddle table; first advanced by mb * ((TWVL / VL) * 6),
 *            then by TWVL * 6 after each iteration.
 *   rs     - stride handed to WS() to address the ten elements.
 *   mb, me - the loop runs m = mb while m < me, in steps of VL.
 *   ms     - x is advanced by VL * ms after each iteration; also passed
 *            to LD/ST.
 *
 * NOTE(review): the exact semantics of LD/ST/LDW/VZMUL/VZMULJ/VFMA/VFNMS/
 * VBYI come from the SIMD headers, which are not visible here.
 */
Chris@42 166 static void t3bv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 167 {
/* Constants, approximately: sin(pi/5), sin(2*pi/5), 1/4, sqrt(5)/4. */
Chris@42 168 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 169 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 170 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 171 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 172 {
Chris@42 173 INT m;
Chris@42 174 R *x;
/* All data access goes through ii; ri is never read or written. */
Chris@42 175 x = ii;
/* The loop header also pre-offsets W for the starting index mb. */
Chris@42 176 for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@42 177 V T1, T2, T3, Ti, T6, T7, TA, Tb, To;
/* T1, T2, T6 are loaded from W; T3, Ti, T7, TA, Tb, To are derived. */
Chris@42 178 T1 = LDW(&(W[0]));
Chris@42 179 T2 = LDW(&(W[TWVL * 2]));
Chris@42 180 T3 = VZMULJ(T1, T2);
Chris@42 181 Ti = VZMUL(T1, T2);
Chris@42 182 T6 = LDW(&(W[TWVL * 4]));
Chris@42 183 T7 = VZMULJ(T3, T6);
Chris@42 184 TA = VZMULJ(Ti, T6);
Chris@42 185 Tb = VZMULJ(T1, T6);
Chris@42 186 To = VZMULJ(T2, T6);
Chris@42 187 {
Chris@42 188 V TD, TQ, Tn, Tt, Tx, TM, TN, TS, Ta, Tg, Tw, TJ, TK, TR, Tz;
Chris@42 189 V TC, TB;
/* Elements 0 and 5: difference in TD, sum in TQ. */
Chris@42 190 Tz = LD(&(x[0]), ms, &(x[0]));
Chris@42 191 TB = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 192 TC = VZMUL(TA, TB);
Chris@42 193 TD = VSUB(Tz, TC);
Chris@42 194 TQ = VADD(Tz, TC);
Chris@42 195 {
/* Elements 4, 1, 9, 6: multiplied, then pairwise sums and differences. */
Chris@42 196 V Tk, Ts, Tm, Tq;
Chris@42 197 {
Chris@42 198 V Tj, Tr, Tl, Tp;
Chris@42 199 Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 200 Tk = VZMUL(Ti, Tj);
Chris@42 201 Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 202 Ts = VZMUL(T1, Tr);
Chris@42 203 Tl = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 204 Tm = VZMUL(T6, Tl);
Chris@42 205 Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 206 Tq = VZMUL(To, Tp);
Chris@42 207 }
Chris@42 208 Tn = VSUB(Tk, Tm);
Chris@42 209 Tt = VSUB(Tq, Ts);
Chris@42 210 Tx = VADD(Tn, Tt);
Chris@42 211 TM = VADD(Tk, Tm);
Chris@42 212 TN = VADD(Tq, Ts);
Chris@42 213 TS = VADD(TM, TN);
Chris@42 214 }
Chris@42 215 {
/* Elements 2, 3, 7, 8: multiplied, then pairwise sums and differences. */
Chris@42 216 V T5, Tf, T9, Td;
Chris@42 217 {
Chris@42 218 V T4, Te, T8, Tc;
Chris@42 219 T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 220 T5 = VZMUL(T3, T4);
Chris@42 221 Te = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 222 Tf = VZMUL(T2, Te);
Chris@42 223 T8 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 224 T9 = VZMUL(T7, T8);
Chris@42 225 Tc = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 226 Td = VZMUL(Tb, Tc);
Chris@42 227 }
Chris@42 228 Ta = VSUB(T5, T9);
Chris@42 229 Tg = VSUB(Td, Tf);
Chris@42 230 Tw = VADD(Ta, Tg);
Chris@42 231 TJ = VADD(T5, T9);
Chris@42 232 TK = VADD(Td, Tf);
Chris@42 233 TR = VADD(TJ, TK);
Chris@42 234 }
Chris@42 235 {
/* Odd-indexed outputs (5, 3, 7, 1, 9), built from the difference terms. */
Chris@42 236 V Ty, TE, TF, Tv, TI, Th, Tu, TH, TG;
Chris@42 237 Ty = VMUL(LDK(KP559016994), VSUB(Tw, Tx));
Chris@42 238 TE = VADD(Tw, Tx);
Chris@42 239 TF = VFNMS(LDK(KP250000000), TE, TD);
Chris@42 240 Th = VSUB(Ta, Tg);
Chris@42 241 Tu = VSUB(Tn, Tt);
Chris@42 242 Tv = VBYI(VFMA(LDK(KP951056516), Th, VMUL(LDK(KP587785252), Tu)));
Chris@42 243 TI = VBYI(VFNMS(LDK(KP951056516), Tu, VMUL(LDK(KP587785252), Th)));
Chris@42 244 ST(&(x[WS(rs, 5)]), VADD(TD, TE), ms, &(x[WS(rs, 1)]));
Chris@42 245 TH = VSUB(TF, Ty);
Chris@42 246 ST(&(x[WS(rs, 3)]), VSUB(TH, TI), ms, &(x[WS(rs, 1)]));
Chris@42 247 ST(&(x[WS(rs, 7)]), VADD(TI, TH), ms, &(x[WS(rs, 1)]));
Chris@42 248 TG = VADD(Ty, TF);
Chris@42 249 ST(&(x[WS(rs, 1)]), VADD(Tv, TG), ms, &(x[WS(rs, 1)]));
Chris@42 250 ST(&(x[WS(rs, 9)]), VSUB(TG, Tv), ms, &(x[WS(rs, 1)]));
Chris@42 251 }
Chris@42 252 {
/* Even-indexed outputs (0, 4, 6, 2, 8), built from the sum terms. */
Chris@42 253 V TV, TT, TU, TP, TY, TL, TO, TX, TW;
Chris@42 254 TV = VMUL(LDK(KP559016994), VSUB(TR, TS));
Chris@42 255 TT = VADD(TR, TS);
Chris@42 256 TU = VFNMS(LDK(KP250000000), TT, TQ);
Chris@42 257 TL = VSUB(TJ, TK);
Chris@42 258 TO = VSUB(TM, TN);
Chris@42 259 TP = VBYI(VFNMS(LDK(KP951056516), TO, VMUL(LDK(KP587785252), TL)));
Chris@42 260 TY = VBYI(VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TO)));
Chris@42 261 ST(&(x[0]), VADD(TQ, TT), ms, &(x[0]));
Chris@42 262 TX = VADD(TV, TU);
Chris@42 263 ST(&(x[WS(rs, 4)]), VSUB(TX, TY), ms, &(x[0]));
Chris@42 264 ST(&(x[WS(rs, 6)]), VADD(TY, TX), ms, &(x[0]));
Chris@42 265 TW = VSUB(TU, TV);
Chris@42 266 ST(&(x[WS(rs, 2)]), VADD(TP, TW), ms, &(x[0]));
Chris@42 267 ST(&(x[WS(rs, 8)]), VSUB(TW, TP), ms, &(x[0]));
Chris@42 268 }
Chris@42 269 }
Chris@42 270 }
Chris@42 271 }
Chris@42 272 VLEAVE();
Chris@42 273 }
Chris@42 274
/*
 * Twiddle instruction table referenced by desc: three VTW entries followed
 * by a TW_NEXT entry.
 * NOTE(review): the three VTW entries presumably correspond to the three
 * LDW loads in t3bv_10 -- confirm against the VTW macro definition.
 */
Chris@42 275 static const tw_instr twinstr[] = {
Chris@42 276 VTW(0, 1),
Chris@42 277 VTW(0, 3),
Chris@42 278 VTW(0, 9),
Chris@42 279 {TW_NEXT, VL, 0}
Chris@42 280 };
Chris@42 281
/*
 * Codelet descriptor passed to the registration call: size 10, the codelet
 * name string, the twiddle instruction table, &GENUS, and the counts
 * {51, 36, 6, 0}, which match the generator's header comment for this
 * variant (51 additions, 36 multiplications, 6 fused multiply/add).
 */
Chris@42 282 static const ct_desc desc = { 10, XSIMD_STRING("t3bv_10"), twinstr, &GENUS, {51, 36, 6, 0}, 0, 0, 0 };
Chris@42 283
/*
 * Registration entry point: hands the t3bv_10 function and its descriptor
 * to the planner p via X(kdft_dit_register).
 */
Chris@42 284 void XSIMD(codelet_t3bv_10) (planner *p) {
Chris@42 285 X(kdft_dit_register) (p, t3bv_10, &desc);
Chris@42 286 }
Chris@42 287 #endif /* HAVE_FMA */