annotate src/fftw-3.3.5/dft/simd/common/t3fv_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:44:01 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 10 -name t3fv_10 -include t3f.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 57 FP additions, 52 FP multiplications,
Chris@42 32 * (or, 39 additions, 34 multiplications, 18 fused multiply/add),
Chris@42 33 * 57 stack variables, 4 constants, and 20 memory accesses
Chris@42 34 */
Chris@42 35 #include "t3f.h"
Chris@42 36
/*
 * t3fv_10 (HAVE_FMA variant): size-10 twiddle codelet that works in place on
 * the data addressed through `ri`.
 *
 * Each loop iteration:
 *   1. loads three twiddle vectors from W (offsets 0, TWVL * 2, TWVL * 4) and
 *      derives six more from them with VZMUL / VZMULJ;
 *   2. loads the ten inputs x[0], x[WS(rs, 1)] .. x[WS(rs, 9)] and multiplies
 *      inputs 1..9 by one twiddle each via VZMULJ (input 0 is used as loaded);
 *   3. combines them with add/sub and fused multiply-add macros and stores
 *      ten results back to the same ten locations.
 *
 * Parameters, as used below:
 *   ri      base of the data; aliased as `x` and advanced by VL * ms per
 *           iteration.
 *   ii      not referenced in this body.
 *   W       twiddle table; first offset by mb * ((TWVL / VL) * 6), then
 *           advanced by TWVL * 6 per iteration.
 *   rs      stride handed to WS() to address the ten inputs/outputs.
 *   mb, me  the loop runs m = mb while m < me, in steps of VL.
 *   ms      per-iteration stride of x; also passed to LD / ST.
 *
 * NOTE(review): the exact semantics of LD, ST, LDW, VZMUL, VZMULJ, VFMA,
 * VFNMS, VFMAI and VFNMSI are defined in the SIMD support headers, which are
 * not visible here. The twiddle pairing below is consistent with
 * VZMULJ(a, b) being conj(a) * b -- confirm against those headers.
 */
Chris@42 37 static void t3fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Constant values: 0.559016994... == sqrt(5)/4, 0.618033988... == (sqrt(5)-1)/2,
 * 0.951056516... == sin(2*pi/5). */
Chris@42 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 40 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 42 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 43 {
Chris@42 44 INT m;
Chris@42 45 R *x;
Chris@42 46 x = ri;
Chris@42 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@42 48 V T1, T7, Th, Tx, Tr, Td, Tp, T6, Tv, Tc, Te, Ti, Tl, T2, T3;
Chris@42 49 V T5;
/* Stored twiddles: T2, T3 and T5 are applied to inputs 1, 3 and 9 below. */
Chris@42 50 T2 = LDW(&(W[0]));
Chris@42 51 T3 = LDW(&(W[TWVL * 2]));
Chris@42 52 T5 = LDW(&(W[TWVL * 4]));
Chris@42 53 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 54 T7 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 55 {
Chris@42 56 V To, Tw, Tq, Tu, Ta, T4, Tt, Tk, Tb;
Chris@42 57 To = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 58 Tw = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 59 Tq = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 60 Tu = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
/* Derived twiddles and the input each one is applied to:
 * Ta -> 2, T4 -> 4, Th -> 8, Tt -> 6, Td -> 7, T6 -> 5. */
Chris@42 61 Ta = VZMULJ(T2, T3);
Chris@42 62 T4 = VZMUL(T2, T3);
Chris@42 63 Th = VZMULJ(T2, T5);
Chris@42 64 Tt = VZMULJ(T3, T5);
Chris@42 65 Tb = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 66 Tx = VZMULJ(T2, Tw);
Chris@42 67 Tr = VZMULJ(T5, Tq);
Chris@42 68 Tk = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 69 Td = VZMULJ(Ta, T5);
Chris@42 70 Tp = VZMULJ(T4, To);
Chris@42 71 T6 = VZMULJ(T4, T5);
Chris@42 72 Tv = VZMULJ(Tt, Tu);
Chris@42 73 Tc = VZMULJ(Ta, Tb);
Chris@42 74 Te = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 75 Ti = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 76 Tl = VZMULJ(T3, Tk);
Chris@42 77 }
Chris@42 78 {
/* Twiddled inputs are paired as (4,9), (6,1), (0,5), (2,7), (8,3).
 * The pair sums (TN, TO, TJ, TK, TL) feed outputs 0, 2, 4, 6, 8;
 * the pair differences (Ts, Ty, T9, Tg, Tm) feed outputs 1, 3, 5, 7, 9. */
Chris@42 79 V TN, Ts, T8, Ty, TO, Tf, Tj;
Chris@42 80 TN = VADD(Tp, Tr);
Chris@42 81 Ts = VSUB(Tp, Tr);
Chris@42 82 T8 = VZMULJ(T6, T7);
Chris@42 83 Ty = VSUB(Tv, Tx);
Chris@42 84 TO = VADD(Tv, Tx);
Chris@42 85 Tf = VZMULJ(Td, Te);
Chris@42 86 Tj = VZMULJ(Th, Ti);
Chris@42 87 {
Chris@42 88 V T9, TJ, TP, TU, Tz, TF, Tg, TK, Tm, TL;
Chris@42 89 T9 = VSUB(T1, T8);
Chris@42 90 TJ = VADD(T1, T8);
Chris@42 91 TP = VADD(TN, TO);
Chris@42 92 TU = VSUB(TN, TO);
Chris@42 93 Tz = VADD(Ts, Ty);
Chris@42 94 TF = VSUB(Ts, Ty);
Chris@42 95 Tg = VSUB(Tc, Tf);
Chris@42 96 TK = VADD(Tc, Tf);
Chris@42 97 Tm = VSUB(Tj, Tl);
Chris@42 98 TL = VADD(Tj, Tl);
Chris@42 99 {
Chris@42 100 V TM, TV, Tn, TE;
Chris@42 101 TM = VADD(TK, TL);
Chris@42 102 TV = VSUB(TK, TL);
Chris@42 103 Tn = VADD(Tg, Tm);
Chris@42 104 TE = VSUB(Tg, Tm);
Chris@42 105 {
Chris@42 106 V TW, TY, TS, TQ, TG, TI, TC, TA, TR, TB;
Chris@42 107 TW = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TV, TU));
Chris@42 108 TY = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TU, TV));
Chris@42 109 TS = VSUB(TM, TP);
Chris@42 110 TQ = VADD(TM, TP);
Chris@42 111 TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TF, TE));
Chris@42 112 TI = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TE, TF));
Chris@42 113 TC = VSUB(Tn, Tz);
Chris@42 114 TA = VADD(Tn, Tz);
/* Outputs 0 and 5 are stored first; the remaining eight follow below. */
Chris@42 115 ST(&(x[0]), VADD(TJ, TQ), ms, &(x[0]));
Chris@42 116 TR = VFNMS(LDK(KP250000000), TQ, TJ);
Chris@42 117 ST(&(x[WS(rs, 5)]), VADD(T9, TA), ms, &(x[WS(rs, 1)]));
Chris@42 118 TB = VFNMS(LDK(KP250000000), TA, T9);
Chris@42 119 {
Chris@42 120 V TX, TT, TH, TD;
Chris@42 121 TX = VFMA(LDK(KP559016994), TS, TR);
Chris@42 122 TT = VFNMS(LDK(KP559016994), TS, TR);
Chris@42 123 TH = VFNMS(LDK(KP559016994), TC, TB);
Chris@42 124 TD = VFMA(LDK(KP559016994), TC, TB);
/* Each VFMAI / VFNMSI pair writes two outputs from the same two operands:
 * (8,2), (6,4), (9,1), (7,3). */
Chris@42 125 ST(&(x[WS(rs, 8)]), VFNMSI(TW, TT), ms, &(x[0]));
Chris@42 126 ST(&(x[WS(rs, 2)]), VFMAI(TW, TT), ms, &(x[0]));
Chris@42 127 ST(&(x[WS(rs, 6)]), VFNMSI(TY, TX), ms, &(x[0]));
Chris@42 128 ST(&(x[WS(rs, 4)]), VFMAI(TY, TX), ms, &(x[0]));
Chris@42 129 ST(&(x[WS(rs, 9)]), VFMAI(TG, TD), ms, &(x[WS(rs, 1)]));
Chris@42 130 ST(&(x[WS(rs, 1)]), VFNMSI(TG, TD), ms, &(x[WS(rs, 1)]));
Chris@42 131 ST(&(x[WS(rs, 7)]), VFMAI(TI, TH), ms, &(x[WS(rs, 1)]));
Chris@42 132 ST(&(x[WS(rs, 3)]), VFNMSI(TI, TH), ms, &(x[WS(rs, 1)]));
Chris@42 133 }
Chris@42 134 }
Chris@42 135 }
Chris@42 136 }
Chris@42 137 }
Chris@42 138 }
Chris@42 139 }
/* Invoked once, after the loop has finished. */
Chris@42 140 VLEAVE();
Chris@42 141 }
Chris@42 142
/*
 * Twiddle instruction table for the HAVE_FMA t3fv_10: three VTW entries
 * (second arguments 1, 3 and 9) followed by a TW_NEXT entry.
 * This matches the three LDW loads in t3fv_10, whose values are applied to
 * inputs 1, 3 and 9.
 * NOTE(review): presumably the TW_NEXT entry terminates the list and VL is the
 * per-step advance -- confirm against the tw_instr definition.
 */
Chris@42 143 static const tw_instr twinstr[] = {
Chris@42 144 VTW(0, 1),
Chris@42 145 VTW(0, 3),
Chris@42 146 VTW(0, 9),
Chris@42 147 {TW_NEXT, VL, 0}
Chris@42 148 };
Chris@42 149
/*
 * Codelet descriptor handed to the planner at registration: size 10, the
 * codelet's name string, its twiddle table, &GENUS, and the counts
 * {39, 34, 18, 0}, which match the "39 additions, 34 multiplications,
 * 18 fused multiply/add" figures in the generator comment for this variant.
 * NOTE(review): the meaning of the three trailing zeros is set by the ct_desc
 * declaration, which is not visible here.
 */
Chris@42 150 static const ct_desc desc = { 10, XSIMD_STRING("t3fv_10"), twinstr, &GENUS, {39, 34, 18, 0}, 0, 0, 0 };
Chris@42 151
/*
 * Registration entry point for the HAVE_FMA build: passes the t3fv_10 kernel
 * and its descriptor to kdft_dit_register on planner `p`.
 */
Chris@42 152 void XSIMD(codelet_t3fv_10) (planner *p) {
Chris@42 153 X(kdft_dit_register) (p, t3fv_10, &desc);
Chris@42 154 }
Chris@42 155 #else /* HAVE_FMA */
Chris@42 156
Chris@42 157 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -no-generate-bytw -n 10 -name t3fv_10 -include t3f.h */
Chris@42 158
Chris@42 159 /*
Chris@42 160 * This function contains 57 FP additions, 42 FP multiplications,
Chris@42 161 * (or, 51 additions, 36 multiplications, 6 fused multiply/add),
Chris@42 162 * 41 stack variables, 4 constants, and 20 memory accesses
Chris@42 163 */
Chris@42 164 #include "t3f.h"
Chris@42 165
/*
 * t3fv_10 (non-FMA variant): size-10 twiddle codelet that works in place on
 * the data addressed through `ri`. It has the same interface and the same
 * load/store locations as the HAVE_FMA variant, but a different arithmetic
 * schedule and constant set.
 *
 * Each loop iteration:
 *   1. loads three twiddle vectors from W (offsets 0, TWVL * 2, TWVL * 4) and
 *      derives six more from them with VZMUL / VZMULJ;
 *   2. loads the ten inputs x[0], x[WS(rs, 1)] .. x[WS(rs, 9)] and multiplies
 *      inputs 1..9 by one twiddle each via VZMULJ (input 0 is used as loaded);
 *   3. computes and stores outputs 5, 3, 7, 1, 9, then outputs 0, 4, 6, 2, 8,
 *      back to the same ten locations.
 *
 * Parameters, as used below:
 *   ri      base of the data; aliased as `x` and advanced by VL * ms per
 *           iteration.
 *   ii      not referenced in this body.
 *   W       twiddle table; first offset by mb * ((TWVL / VL) * 6), then
 *           advanced by TWVL * 6 per iteration.
 *   rs      stride handed to WS() to address the ten inputs/outputs.
 *   mb, me  the loop runs m = mb while m < me, in steps of VL.
 *   ms      per-iteration stride of x; also passed to LD / ST.
 *
 * NOTE(review): the exact semantics of LD, ST, LDW, VZMUL, VZMULJ, VFMA,
 * VFNMS and VBYI are defined in the SIMD support headers, which are not
 * visible here. The twiddle pairing below is consistent with VZMULJ(a, b)
 * being conj(a) * b -- confirm against those headers.
 */
Chris@42 166 static void t3fv_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 167 {
/* Constant values: 0.587785252... == sin(pi/5), 0.951056516... == sin(2*pi/5),
 * 0.559016994... == sqrt(5)/4. */
Chris@42 168 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 169 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 170 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 171 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 172 {
Chris@42 173 INT m;
Chris@42 174 R *x;
Chris@42 175 x = ri;
Chris@42 176 for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(10, rs)) {
Chris@42 177 V T1, T2, T3, Ti, T6, T7, Tx, Tb, To;
/* Stored twiddles T1, T2, T6 are applied to inputs 1, 3 and 9. Derived
 * twiddles and their inputs: T3 -> 2, Ti -> 4, T7 -> 7, Tx -> 5, Tb -> 8,
 * To -> 6. */
Chris@42 178 T1 = LDW(&(W[0]));
Chris@42 179 T2 = LDW(&(W[TWVL * 2]));
Chris@42 180 T3 = VZMULJ(T1, T2);
Chris@42 181 Ti = VZMUL(T1, T2);
Chris@42 182 T6 = LDW(&(W[TWVL * 4]));
Chris@42 183 T7 = VZMULJ(T3, T6);
Chris@42 184 Tx = VZMULJ(Ti, T6);
Chris@42 185 Tb = VZMULJ(T1, T6);
Chris@42 186 To = VZMULJ(T2, T6);
Chris@42 187 {
Chris@42 188 V TA, TQ, Tn, Tt, Tu, TJ, TK, TS, Ta, Tg, Th, TM, TN, TR, Tw;
Chris@42 189 V Tz, Ty;
/* Pair (0,5): TA is the difference, TQ the sum. */
Chris@42 190 Tw = LD(&(x[0]), ms, &(x[0]));
Chris@42 191 Ty = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 192 Tz = VZMULJ(Tx, Ty);
Chris@42 193 TA = VSUB(Tw, Tz);
Chris@42 194 TQ = VADD(Tw, Tz);
Chris@42 195 {
/* Pairs (4,9) and (6,1): differences Tn, Tt and sums TJ, TK. */
Chris@42 196 V Tk, Ts, Tm, Tq;
Chris@42 197 {
Chris@42 198 V Tj, Tr, Tl, Tp;
Chris@42 199 Tj = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 200 Tk = VZMULJ(Ti, Tj);
Chris@42 201 Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 202 Ts = VZMULJ(T1, Tr);
Chris@42 203 Tl = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 204 Tm = VZMULJ(T6, Tl);
Chris@42 205 Tp = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 206 Tq = VZMULJ(To, Tp);
Chris@42 207 }
Chris@42 208 Tn = VSUB(Tk, Tm);
Chris@42 209 Tt = VSUB(Tq, Ts);
Chris@42 210 Tu = VADD(Tn, Tt);
Chris@42 211 TJ = VADD(Tk, Tm);
Chris@42 212 TK = VADD(Tq, Ts);
Chris@42 213 TS = VADD(TJ, TK);
Chris@42 214 }
Chris@42 215 {
/* Pairs (2,7) and (8,3): differences Ta, Tg and sums TM, TN. */
Chris@42 216 V T5, Tf, T9, Td;
Chris@42 217 {
Chris@42 218 V T4, Te, T8, Tc;
Chris@42 219 T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 220 T5 = VZMULJ(T3, T4);
Chris@42 221 Te = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 222 Tf = VZMULJ(T2, Te);
Chris@42 223 T8 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 224 T9 = VZMULJ(T7, T8);
Chris@42 225 Tc = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 226 Td = VZMULJ(Tb, Tc);
Chris@42 227 }
Chris@42 228 Ta = VSUB(T5, T9);
Chris@42 229 Tg = VSUB(Td, Tf);
Chris@42 230 Th = VADD(Ta, Tg);
Chris@42 231 TM = VADD(T5, T9);
Chris@42 232 TN = VADD(Td, Tf);
Chris@42 233 TR = VADD(TM, TN);
Chris@42 234 }
Chris@42 235 {
/* Outputs 5, 3, 7, 1, 9, built from the pair differences only. */
Chris@42 236 V Tv, TB, TC, TG, TI, TE, TF, TH, TD;
Chris@42 237 Tv = VMUL(LDK(KP559016994), VSUB(Th, Tu));
Chris@42 238 TB = VADD(Th, Tu);
Chris@42 239 TC = VFNMS(LDK(KP250000000), TB, TA);
Chris@42 240 TE = VSUB(Ta, Tg);
Chris@42 241 TF = VSUB(Tn, Tt);
Chris@42 242 TG = VBYI(VFMA(LDK(KP951056516), TE, VMUL(LDK(KP587785252), TF)));
Chris@42 243 TI = VBYI(VFNMS(LDK(KP587785252), TE, VMUL(LDK(KP951056516), TF)));
Chris@42 244 ST(&(x[WS(rs, 5)]), VADD(TA, TB), ms, &(x[WS(rs, 1)]));
Chris@42 245 TH = VSUB(TC, Tv);
Chris@42 246 ST(&(x[WS(rs, 3)]), VSUB(TH, TI), ms, &(x[WS(rs, 1)]));
Chris@42 247 ST(&(x[WS(rs, 7)]), VADD(TI, TH), ms, &(x[WS(rs, 1)]));
Chris@42 248 TD = VADD(Tv, TC);
Chris@42 249 ST(&(x[WS(rs, 1)]), VSUB(TD, TG), ms, &(x[WS(rs, 1)]));
Chris@42 250 ST(&(x[WS(rs, 9)]), VADD(TG, TD), ms, &(x[WS(rs, 1)]));
Chris@42 251 }
Chris@42 252 {
/* Outputs 0, 4, 6, 2, 8, built from the pair sums only. */
Chris@42 253 V TV, TT, TU, TP, TX, TL, TO, TY, TW;
Chris@42 254 TV = VMUL(LDK(KP559016994), VSUB(TR, TS));
Chris@42 255 TT = VADD(TR, TS);
Chris@42 256 TU = VFNMS(LDK(KP250000000), TT, TQ);
Chris@42 257 TL = VSUB(TJ, TK);
Chris@42 258 TO = VSUB(TM, TN);
Chris@42 259 TP = VBYI(VFNMS(LDK(KP587785252), TO, VMUL(LDK(KP951056516), TL)));
Chris@42 260 TX = VBYI(VFMA(LDK(KP951056516), TO, VMUL(LDK(KP587785252), TL)));
Chris@42 261 ST(&(x[0]), VADD(TQ, TT), ms, &(x[0]));
Chris@42 262 TY = VADD(TV, TU);
Chris@42 263 ST(&(x[WS(rs, 4)]), VADD(TX, TY), ms, &(x[0]));
Chris@42 264 ST(&(x[WS(rs, 6)]), VSUB(TY, TX), ms, &(x[0]));
Chris@42 265 TW = VSUB(TU, TV);
Chris@42 266 ST(&(x[WS(rs, 2)]), VADD(TP, TW), ms, &(x[0]));
Chris@42 267 ST(&(x[WS(rs, 8)]), VSUB(TW, TP), ms, &(x[0]));
Chris@42 268 }
Chris@42 269 }
Chris@42 270 }
Chris@42 271 }
/* Invoked once, after the loop has finished. */
Chris@42 272 VLEAVE();
Chris@42 273 }
Chris@42 274
/*
 * Twiddle instruction table for the non-FMA t3fv_10: three VTW entries
 * (second arguments 1, 3 and 9) followed by a TW_NEXT entry.
 * This matches the three LDW loads in t3fv_10, whose values are applied to
 * inputs 1, 3 and 9.
 * NOTE(review): presumably the TW_NEXT entry terminates the list and VL is the
 * per-step advance -- confirm against the tw_instr definition.
 */
Chris@42 275 static const tw_instr twinstr[] = {
Chris@42 276 VTW(0, 1),
Chris@42 277 VTW(0, 3),
Chris@42 278 VTW(0, 9),
Chris@42 279 {TW_NEXT, VL, 0}
Chris@42 280 };
Chris@42 281
/*
 * Codelet descriptor handed to the planner at registration: size 10, the
 * codelet's name string, its twiddle table, &GENUS, and the counts
 * {51, 36, 6, 0}, which match the "51 additions, 36 multiplications,
 * 6 fused multiply/add" figures in the generator comment for this variant.
 * NOTE(review): the meaning of the three trailing zeros is set by the ct_desc
 * declaration, which is not visible here.
 */
Chris@42 282 static const ct_desc desc = { 10, XSIMD_STRING("t3fv_10"), twinstr, &GENUS, {51, 36, 6, 0}, 0, 0, 0 };
Chris@42 283
/*
 * Registration entry point for the non-FMA build: passes the t3fv_10 kernel
 * and its descriptor to kdft_dit_register on planner `p`.
 */
Chris@42 284 void XSIMD(codelet_t3fv_10) (planner *p) {
Chris@42 285 X(kdft_dit_register) (p, t3fv_10, &desc);
Chris@42 286 }
Chris@42 287 #endif /* HAVE_FMA */