annotate src/fftw-3.3.5/dft/simd/common/t1fv_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:42:00 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1fv_10 -include t1f.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 51 FP additions, 40 FP multiplications,
Chris@42 32 * (or, 33 additions, 22 multiplications, 18 fused multiply/add),
Chris@42 33 * 43 stack variables, 4 constants, and 20 memory accesses
Chris@42 34 */
Chris@42 35 #include "t1f.h"
Chris@42 36
/*
 * t1fv_10 (HAVE_FMA variant): generated size-10 SIMD twiddle codelet.
 *
 * ri  - base pointer of the data; ten elements x[WS(rs, 0..9)] are read and
 *       then overwritten in place on every loop iteration.
 * ii  - not referenced anywhere in this body.
 * W   - twiddle table; advanced by TWVL * 18 per iteration, after an initial
 *       offset of mb * ((TWVL / VL) * 18).
 * rs  - stride handed to WS() to address the ten elements.
 * mb, me - loop bounds: m runs from mb up to (excluding) me in steps of VL.
 * ms  - per-iteration stride: x advances by VL * ms each pass.
 *
 * Input k (k = 1..9) is multiplied by the twiddle at W[TWVL * 2 * (k - 1)]
 * through BYTWJ; input 0 is used as loaded.
 * NOTE(review): the exact semantics of LD/ST/BYTWJ/VFMAI/VFNMSI come from the
 * SIMD headers, which are not visible here - confirm there before relying on
 * any detail beyond what the code below shows.
 */
Chris@42 37 static void t1fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Constants, numerically: sqrt(5)/4, 1/4, (sqrt(5)-1)/2 and sin(2*pi/5). */
Chris@42 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 40 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 42 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 43 {
Chris@42 44 INT m;
Chris@42 45 R *x;
Chris@42 46 x = ri;
/* One pass per VL steps of m; x and W are advanced in the loop increment. */
Chris@42 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@42 48 V Td, TA, T4, Ta, Tk, TE, Tp, TF, TB, T9, T1, T2, Tb;
/*
 * Loads: the third LD argument is &(x[0]) for even indices and
 * &(x[WS(rs, 1)]) for odd indices (same pattern on the ST calls below).
 */
Chris@42 49 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 50 T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 51 {
Chris@42 52 V Tg, Tn, Ti, Tl;
Chris@42 53 Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 54 Tn = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 55 Ti = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 56 Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 57 {
Chris@42 58 V T6, T8, T5, Tc;
Chris@42 59 T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 60 Tc = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 61 {
Chris@42 62 V T3, Th, To, Tj, Tm, T7;
Chris@42 63 T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
/* Twiddle multiplications for inputs 5, 4, 1, 9, 6, 2, 3 and 7. */
Chris@42 64 T3 = BYTWJ(&(W[TWVL * 8]), T2);
Chris@42 65 Th = BYTWJ(&(W[TWVL * 6]), Tg);
Chris@42 66 To = BYTWJ(&(W[0]), Tn);
Chris@42 67 Tj = BYTWJ(&(W[TWVL * 16]), Ti);
Chris@42 68 Tm = BYTWJ(&(W[TWVL * 10]), Tl);
Chris@42 69 T6 = BYTWJ(&(W[TWVL * 2]), T5);
Chris@42 70 Td = BYTWJ(&(W[TWVL * 4]), Tc);
Chris@42 71 T8 = BYTWJ(&(W[TWVL * 12]), T7);
/* Pairwise sums and differences of inputs that are five apart (0/5, 4/9, 6/1). */
Chris@42 72 TA = VADD(T1, T3);
Chris@42 73 T4 = VSUB(T1, T3);
Chris@42 74 Ta = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 75 Tk = VSUB(Th, Tj);
Chris@42 76 TE = VADD(Th, Tj);
Chris@42 77 Tp = VSUB(Tm, To);
Chris@42 78 TF = VADD(Tm, To);
Chris@42 79 }
/* Same for the pair 2/7. */
Chris@42 80 TB = VADD(T6, T8);
Chris@42 81 T9 = VSUB(T6, T8);
Chris@42 82 }
Chris@42 83 }
/* Last twiddle multiplication (input 8), paired with input 3 below. */
Chris@42 84 Tb = BYTWJ(&(W[TWVL * 14]), Ta);
Chris@42 85 {
Chris@42 86 V TL, TG, Tw, Tq, TC, Te;
Chris@42 87 TL = VSUB(TE, TF);
Chris@42 88 TG = VADD(TE, TF);
Chris@42 89 Tw = VSUB(Tk, Tp);
Chris@42 90 Tq = VADD(Tk, Tp);
Chris@42 91 TC = VADD(Tb, Td);
Chris@42 92 Te = VSUB(Tb, Td);
Chris@42 93 {
Chris@42 94 V TM, TD, Tv, Tf;
Chris@42 95 TM = VSUB(TB, TC);
Chris@42 96 TD = VADD(TB, TC);
Chris@42 97 Tv = VSUB(T9, Te);
Chris@42 98 Tf = VADD(T9, Te);
Chris@42 99 {
Chris@42 100 V TP, TN, TH, TJ, Tz, Tx, Tr, Tt, TI, Ts;
/*
 * Sum-derived terms (TA, TD, TG, TL, TM) feed outputs 0, 2, 4, 6, 8;
 * difference-derived terms (T4, Tf, Tq, Tv, Tw) feed outputs 1, 3, 5, 7, 9.
 */
Chris@42 101 TP = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TL, TM));
Chris@42 102 TN = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TM, TL));
Chris@42 103 TH = VADD(TD, TG);
Chris@42 104 TJ = VSUB(TD, TG);
Chris@42 105 Tz = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tv, Tw));
Chris@42 106 Tx = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tw, Tv));
Chris@42 107 Tr = VADD(Tf, Tq);
Chris@42 108 Tt = VSUB(Tf, Tq);
/* Outputs 0 and 5 are plain sums; they overwrite the inputs already loaded above. */
Chris@42 109 ST(&(x[0]), VADD(TA, TH), ms, &(x[0]));
Chris@42 110 TI = VFNMS(LDK(KP250000000), TH, TA);
Chris@42 111 ST(&(x[WS(rs, 5)]), VADD(T4, Tr), ms, &(x[WS(rs, 1)]));
Chris@42 112 Ts = VFNMS(LDK(KP250000000), Tr, T4);
Chris@42 113 {
Chris@42 114 V TK, TO, Tu, Ty;
Chris@42 115 TK = VFNMS(LDK(KP559016994), TJ, TI);
Chris@42 116 TO = VFMA(LDK(KP559016994), TJ, TI);
Chris@42 117 Tu = VFMA(LDK(KP559016994), Tt, Ts);
Chris@42 118 Ty = VFNMS(LDK(KP559016994), Tt, Ts);
/* Remaining eight outputs, stored in VFNMSI/VFMAI pairs: 8/2, 6/4, 9/1, 7/3. */
Chris@42 119 ST(&(x[WS(rs, 8)]), VFNMSI(TN, TK), ms, &(x[0]));
Chris@42 120 ST(&(x[WS(rs, 2)]), VFMAI(TN, TK), ms, &(x[0]));
Chris@42 121 ST(&(x[WS(rs, 6)]), VFNMSI(TP, TO), ms, &(x[0]));
Chris@42 122 ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
Chris@42 123 ST(&(x[WS(rs, 9)]), VFMAI(Tx, Tu), ms, &(x[WS(rs, 1)]));
Chris@42 124 ST(&(x[WS(rs, 1)]), VFNMSI(Tx, Tu), ms, &(x[WS(rs, 1)]));
Chris@42 125 ST(&(x[WS(rs, 7)]), VFMAI(Tz, Ty), ms, &(x[WS(rs, 1)]));
Chris@42 126 ST(&(x[WS(rs, 3)]), VFNMSI(Tz, Ty), ms, &(x[WS(rs, 1)]));
Chris@42 127 }
Chris@42 128 }
Chris@42 129 }
Chris@42 130 }
Chris@42 131 }
Chris@42 132 }
Chris@42 133 VLEAVE();
Chris@42 134 }
Chris@42 135
/*
 * Twiddle instruction table for the HAVE_FMA build of t1fv_10: nine VTW
 * entries with second argument 1..9, followed by a {TW_NEXT, VL, 0} entry.
 * The nine entries line up with the nine BYTWJ multiplications in t1fv_10.
 * NOTE(review): the meaning of the VTW arguments and of TW_NEXT is defined
 * outside this file - presumably one twiddle per non-zero input index plus an
 * end-of-list marker; confirm against the tw_instr definition.
 */
Chris@42 136 static const tw_instr twinstr[] = {
Chris@42 137 VTW(0, 1),
Chris@42 138 VTW(0, 2),
Chris@42 139 VTW(0, 3),
Chris@42 140 VTW(0, 4),
Chris@42 141 VTW(0, 5),
Chris@42 142 VTW(0, 6),
Chris@42 143 VTW(0, 7),
Chris@42 144 VTW(0, 8),
Chris@42 145 VTW(0, 9),
Chris@42 146 {TW_NEXT, VL, 0}
Chris@42 147 };
Chris@42 148
/*
 * Codelet descriptor for the HAVE_FMA build: ties the name "t1fv_10" to the
 * twinstr table and GENUS. The leading 10 matches the "-n 10" generator
 * option, and {33, 22, 18, 0} repeats the add / multiply / fused multiply-add
 * counts from the generator's summary comment for this variant.
 * NOTE(review): the meaning of the trailing 0, 0, 0 fields is not visible
 * here - see the ct_desc definition.
 */
Chris@42 149 static const ct_desc desc = { 10, XSIMD_STRING("t1fv_10"), twinstr, &GENUS, {33, 22, 18, 0}, 0, 0, 0 };
Chris@42 150
/*
 * Registration entry point (HAVE_FMA build): hands the t1fv_10 function and
 * its descriptor to the planner p through kdft_dit_register.
 */
Chris@42 151 void XSIMD(codelet_t1fv_10) (planner *p) {
Chris@42 152 X(kdft_dit_register) (p, t1fv_10, &desc);
Chris@42 153 }
Chris@42 154 #else /* HAVE_FMA */
Chris@42 155
Chris@42 156 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name t1fv_10 -include t1f.h */
Chris@42 157
Chris@42 158 /*
Chris@42 159 * This function contains 51 FP additions, 30 FP multiplications,
Chris@42 160 * (or, 45 additions, 24 multiplications, 6 fused multiply/add),
Chris@42 161 * 32 stack variables, 4 constants, and 20 memory accesses
Chris@42 162 */
Chris@42 163 #include "t1f.h"
Chris@42 164
/*
 * t1fv_10 (non-FMA variant): generated size-10 SIMD twiddle codelet.
 *
 * ri  - base pointer of the data; ten elements x[WS(rs, 0..9)] are read and
 *       then overwritten in place on every loop iteration.
 * ii  - not referenced anywhere in this body.
 * W   - twiddle table; advanced by TWVL * 18 per iteration, after an initial
 *       offset of mb * ((TWVL / VL) * 18).
 * rs  - stride handed to WS() to address the ten elements.
 * mb, me - loop bounds: m runs from mb up to (excluding) me in steps of VL.
 * ms  - per-iteration stride: x advances by VL * ms each pass.
 *
 * Input k (k = 1..9) is multiplied by the twiddle at W[TWVL * 2 * (k - 1)]
 * through BYTWJ; input 0 is used as loaded.
 * NOTE(review): the exact semantics of LD/ST/BYTWJ/VBYI come from the SIMD
 * headers, which are not visible here - confirm there before relying on any
 * detail beyond what the code below shows.
 */
Chris@42 165 static void t1fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 166 {
/* Constants, numerically: sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
Chris@42 167 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 168 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 169 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 170 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 171 {
Chris@42 172 INT m;
Chris@42 173 R *x;
Chris@42 174 x = ri;
/* One pass per VL steps of m; x and W are advanced in the loop increment. */
Chris@42 175 for (m = mb, W = W + (mb * ((TWVL / VL) * 18)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@42 176 V Tr, TH, Tg, Tl, Tm, TA, TB, TJ, T5, Ta, Tb, TD, TE, TI, To;
Chris@42 177 V Tq, Tp;
/* Inputs 0 and 5: difference in Tr, sum in TH. */
Chris@42 178 To = LD(&(x[0]), ms, &(x[0]));
Chris@42 179 Tp = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 180 Tq = BYTWJ(&(W[TWVL * 8]), Tp);
Chris@42 181 Tr = VSUB(To, Tq);
Chris@42 182 TH = VADD(To, Tq);
Chris@42 183 {
Chris@42 184 V Td, Tk, Tf, Ti;
Chris@42 185 {
Chris@42 186 V Tc, Tj, Te, Th;
/* Load and twiddle inputs 4, 1, 9 and 6. */
Chris@42 187 Tc = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 188 Td = BYTWJ(&(W[TWVL * 6]), Tc);
Chris@42 189 Tj = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 190 Tk = BYTWJ(&(W[0]), Tj);
Chris@42 191 Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 192 Tf = BYTWJ(&(W[TWVL * 16]), Te);
Chris@42 193 Th = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 194 Ti = BYTWJ(&(W[TWVL * 10]), Th);
Chris@42 195 }
/* Differences (Tg, Tl, Tm) and sums (TA, TB, TJ) of the pairs 4/9 and 6/1. */
Chris@42 196 Tg = VSUB(Td, Tf);
Chris@42 197 Tl = VSUB(Ti, Tk);
Chris@42 198 Tm = VADD(Tg, Tl);
Chris@42 199 TA = VADD(Td, Tf);
Chris@42 200 TB = VADD(Ti, Tk);
Chris@42 201 TJ = VADD(TA, TB);
Chris@42 202 }
Chris@42 203 {
Chris@42 204 V T2, T9, T4, T7;
Chris@42 205 {
Chris@42 206 V T1, T8, T3, T6;
/* Load and twiddle inputs 2, 3, 7 and 8. */
Chris@42 207 T1 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 208 T2 = BYTWJ(&(W[TWVL * 2]), T1);
Chris@42 209 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 210 T9 = BYTWJ(&(W[TWVL * 4]), T8);
Chris@42 211 T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 212 T4 = BYTWJ(&(W[TWVL * 12]), T3);
Chris@42 213 T6 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 214 T7 = BYTWJ(&(W[TWVL * 14]), T6);
Chris@42 215 }
/* Differences (T5, Ta, Tb) and sums (TD, TE, TI) of the pairs 2/7 and 8/3. */
Chris@42 216 T5 = VSUB(T2, T4);
Chris@42 217 Ta = VSUB(T7, T9);
Chris@42 218 Tb = VADD(T5, Ta);
Chris@42 219 TD = VADD(T2, T4);
Chris@42 220 TE = VADD(T7, T9);
Chris@42 221 TI = VADD(TD, TE);
Chris@42 222 }
/* Odd-indexed outputs (5, 3, 7, 1, 9), built from the difference terms. */
Chris@42 223 {
Chris@42 224 V Tn, Ts, Tt, Tx, Tz, Tv, Tw, Ty, Tu;
Chris@42 225 Tn = VMUL(LDK(KP559016994), VSUB(Tb, Tm));
Chris@42 226 Ts = VADD(Tb, Tm);
Chris@42 227 Tt = VFNMS(LDK(KP250000000), Ts, Tr);
Chris@42 228 Tv = VSUB(T5, Ta);
Chris@42 229 Tw = VSUB(Tg, Tl);
Chris@42 230 Tx = VBYI(VFMA(LDK(KP951056516), Tv, VMUL(LDK(KP587785252), Tw)));
Chris@42 231 Tz = VBYI(VFNMS(LDK(KP587785252), Tv, VMUL(LDK(KP951056516), Tw)));
Chris@42 232 ST(&(x[WS(rs, 5)]), VADD(Tr, Ts), ms, &(x[WS(rs, 1)]));
Chris@42 233 Ty = VSUB(Tt, Tn);
Chris@42 234 ST(&(x[WS(rs, 3)]), VSUB(Ty, Tz), ms, &(x[WS(rs, 1)]));
Chris@42 235 ST(&(x[WS(rs, 7)]), VADD(Tz, Ty), ms, &(x[WS(rs, 1)]));
Chris@42 236 Tu = VADD(Tn, Tt);
Chris@42 237 ST(&(x[WS(rs, 1)]), VSUB(Tu, Tx), ms, &(x[WS(rs, 1)]));
Chris@42 238 ST(&(x[WS(rs, 9)]), VADD(Tx, Tu), ms, &(x[WS(rs, 1)]));
Chris@42 239 }
/* Even-indexed outputs (0, 4, 6, 2, 8), built from the sum terms. */
Chris@42 240 {
Chris@42 241 V TM, TK, TL, TG, TO, TC, TF, TP, TN;
Chris@42 242 TM = VMUL(LDK(KP559016994), VSUB(TI, TJ));
Chris@42 243 TK = VADD(TI, TJ);
Chris@42 244 TL = VFNMS(LDK(KP250000000), TK, TH);
Chris@42 245 TC = VSUB(TA, TB);
Chris@42 246 TF = VSUB(TD, TE);
Chris@42 247 TG = VBYI(VFNMS(LDK(KP587785252), TF, VMUL(LDK(KP951056516), TC)));
Chris@42 248 TO = VBYI(VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TC)));
Chris@42 249 ST(&(x[0]), VADD(TH, TK), ms, &(x[0]));
Chris@42 250 TP = VADD(TM, TL);
Chris@42 251 ST(&(x[WS(rs, 4)]), VADD(TO, TP), ms, &(x[0]));
Chris@42 252 ST(&(x[WS(rs, 6)]), VSUB(TP, TO), ms, &(x[0]));
Chris@42 253 TN = VSUB(TL, TM);
Chris@42 254 ST(&(x[WS(rs, 2)]), VADD(TG, TN), ms, &(x[0]));
Chris@42 255 ST(&(x[WS(rs, 8)]), VSUB(TN, TG), ms, &(x[0]));
Chris@42 256 }
Chris@42 257 }
Chris@42 258 }
Chris@42 259 VLEAVE();
Chris@42 260 }
Chris@42 261
/*
 * Twiddle instruction table for the non-FMA build of t1fv_10: nine VTW
 * entries with second argument 1..9, followed by a {TW_NEXT, VL, 0} entry.
 * The nine entries line up with the nine BYTWJ multiplications in t1fv_10.
 * NOTE(review): the meaning of the VTW arguments and of TW_NEXT is defined
 * outside this file - presumably one twiddle per non-zero input index plus an
 * end-of-list marker; confirm against the tw_instr definition.
 */
Chris@42 262 static const tw_instr twinstr[] = {
Chris@42 263 VTW(0, 1),
Chris@42 264 VTW(0, 2),
Chris@42 265 VTW(0, 3),
Chris@42 266 VTW(0, 4),
Chris@42 267 VTW(0, 5),
Chris@42 268 VTW(0, 6),
Chris@42 269 VTW(0, 7),
Chris@42 270 VTW(0, 8),
Chris@42 271 VTW(0, 9),
Chris@42 272 {TW_NEXT, VL, 0}
Chris@42 273 };
Chris@42 274
/*
 * Codelet descriptor for the non-FMA build: ties the name "t1fv_10" to the
 * twinstr table and GENUS. The leading 10 matches the "-n 10" generator
 * option, and {45, 24, 6, 0} repeats the add / multiply / fused multiply-add
 * counts from the generator's summary comment for this variant.
 * NOTE(review): the meaning of the trailing 0, 0, 0 fields is not visible
 * here - see the ct_desc definition.
 */
Chris@42 275 static const ct_desc desc = { 10, XSIMD_STRING("t1fv_10"), twinstr, &GENUS, {45, 24, 6, 0}, 0, 0, 0 };
Chris@42 276
/*
 * Registration entry point (non-FMA build): hands the t1fv_10 function and
 * its descriptor to the planner p through kdft_dit_register.
 */
Chris@42 277 void XSIMD(codelet_t1fv_10) (planner *p) {
Chris@42 278 X(kdft_dit_register) (p, t1fv_10, &desc);
Chris@42 279 }
Chris@42 280 #endif /* HAVE_FMA */