annotate src/fftw-3.3.3/dft/simd/common/t1bv_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:39:04 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1bv_12 -include t1b.h -sign 1 */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 59 FP additions, 42 FP multiplications,
Chris@10 32 * (or, 41 additions, 24 multiplications, 18 fused multiply/add),
Chris@10 33 * 41 stack variables, 2 constants, and 24 memory accesses
Chris@10 34 */
Chris@10 35 #include "t1b.h"
Chris@10 36
/*
 * t1bv_12 (HAVE_FMA variant): size-12 twiddle codelet; see the genfft
 * command line in the comment above (-n 12, -sign 1, -simd, -fma).
 *
 * Each loop step loads the 12 inputs x[WS(rs, 0)] .. x[WS(rs, 11)], passes
 * inputs 1..11 through BYTW using the twiddle entry at
 * W[TWVL * 2 * (k - 1)] for input k (input 0 is used as loaded), combines
 * them, and stores 12 results back to the same 12 locations. All loads of a
 * step come before its first store.
 *
 * Parameters:
 *   ri     - not referenced anywhere in this body.
 *   ii     - base pointer of the data; aliased as x and advanced by
 *            VL * ms after every step.
 *   W      - twiddle table; offset by mb * ((TWVL / VL) * 22) on entry and
 *            advanced by TWVL * 22 after every step.
 *   rs     - stride between the 12 points, applied through WS(rs, k).
 *   mb, me - half-open range [mb, me) of the loop counter m, stepped by VL.
 *   ms     - used to advance x between steps; also forwarded to LD and ST.
 *
 * NOTE(review): LD, ST, BYTW, VADD, VSUB, VMUL, VFMA, VFNMS, VFMAI, VFNMSI,
 * DVK, LDK, MAKE_VOLATILE_STRIDE and VLEAVE are macros from headers that are
 * not part of this listing; their exact semantics should be confirmed there.
 */
Chris@10 37 static void t1bv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
/* The two constants used below; 0.866025403... is numerically sqrt(3)/2. */
Chris@10 39 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 41 {
Chris@10 42 INT m;
Chris@10 43 R *x;
/* Every load and store in this function goes through ii. */
Chris@10 44 x = ii;
/* One step per VL values of m; x and W are advanced in the loop header. */
Chris@10 45 for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
/* Intermediates that are still needed after the nested load blocks close. */
Chris@10 46 V TI, Ti, TA, T7, Tm, TE, Tw, Tk, Tf, TB, TU, TM;
Chris@10 47 {
Chris@10 48 V T9, TK, Tj, TL, Te;
Chris@10 49 {
Chris@10 50 V T1, T4, T2, Tp, Tt, Tr;
/* Raw loads of inputs 0, 8, 4, 9, 5 and 1. */
Chris@10 51 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 52 T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 53 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 54 Tp = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@10 55 Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 56 Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 57 {
Chris@10 58 V T5, T3, Tq, Tu, Ts, Td, Tb, T8, Tc, Ta;
/* Raw loads of inputs 6, 2 and 10. */
Chris@10 59 T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 60 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 61 Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
/* Apply the per-input twiddle entries to the values loaded so far. */
Chris@10 62 T5 = BYTW(&(W[TWVL * 14]), T4);
Chris@10 63 T3 = BYTW(&(W[TWVL * 6]), T2);
Chris@10 64 Tq = BYTW(&(W[TWVL * 16]), Tp);
Chris@10 65 Tu = BYTW(&(W[TWVL * 8]), Tt);
Chris@10 66 Ts = BYTW(&(W[0]), Tr);
Chris@10 67 T9 = BYTW(&(W[TWVL * 10]), T8);
Chris@10 68 Td = BYTW(&(W[TWVL * 2]), Tc);
Chris@10 69 Tb = BYTW(&(W[TWVL * 18]), Ta);
Chris@10 70 {
Chris@10 71 V Th, T6, Tl, Tv;
/* Remaining loads (inputs 3, 11, 7) are interleaved with the first sums and differences. */
Chris@10 72 Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 73 TK = VSUB(T3, T5);
Chris@10 74 T6 = VADD(T3, T5);
Chris@10 75 Tl = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@10 76 Tv = VADD(Ts, Tu);
Chris@10 77 TI = VSUB(Tu, Ts);
Chris@10 78 Tj = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 79 TL = VSUB(Tb, Td);
Chris@10 80 Te = VADD(Tb, Td);
Chris@10 81 Ti = BYTW(&(W[TWVL * 4]), Th);
Chris@10 82 TA = VFNMS(LDK(KP500000000), T6, T1);
Chris@10 83 T7 = VADD(T1, T6);
Chris@10 84 Tm = BYTW(&(W[TWVL * 20]), Tl);
Chris@10 85 TE = VFNMS(LDK(KP500000000), Tv, Tq);
Chris@10 86 Tw = VADD(Tq, Tv);
Chris@10 87 }
Chris@10 88 }
Chris@10 89 }
Chris@10 90 Tk = BYTW(&(W[TWVL * 12]), Tj);
Chris@10 91 Tf = VADD(T9, Te);
Chris@10 92 TB = VFNMS(LDK(KP500000000), Te, T9);
Chris@10 93 TU = VSUB(TK, TL);
Chris@10 94 TM = VADD(TK, TL);
Chris@10 95 }
Chris@10 96 {
Chris@10 97 V Tn, TH, TC, TQ, Ty, Tg;
Chris@10 98 Tn = VADD(Tk, Tm);
Chris@10 99 TH = VSUB(Tk, Tm);
Chris@10 100 TC = VADD(TA, TB);
Chris@10 101 TQ = VSUB(TA, TB);
Chris@10 102 Ty = VADD(T7, Tf);
Chris@10 103 Tg = VSUB(T7, Tf);
Chris@10 104 {
Chris@10 105 V To, TD, TJ, TR;
Chris@10 106 To = VADD(Ti, Tn);
Chris@10 107 TD = VFNMS(LDK(KP500000000), Tn, Ti);
Chris@10 108 TJ = VSUB(TH, TI);
Chris@10 109 TR = VADD(TH, TI);
Chris@10 110 {
Chris@10 111 V TP, TN, TW, TS, TO, TG, TX, TV;
Chris@10 112 {
Chris@10 113 V Tz, Tx, TF, TT;
Chris@10 114 Tz = VADD(To, Tw);
Chris@10 115 Tx = VSUB(To, Tw);
Chris@10 116 TF = VADD(TD, TE);
Chris@10 117 TT = VSUB(TD, TE);
Chris@10 118 TP = VMUL(LDK(KP866025403), VADD(TM, TJ));
Chris@10 119 TN = VMUL(LDK(KP866025403), VSUB(TJ, TM));
Chris@10 120 TW = VFMA(LDK(KP866025403), TR, TQ);
Chris@10 121 TS = VFNMS(LDK(KP866025403), TR, TQ);
/* First stores: outputs 6, 0, 9 and 3. Every input has been loaded by now. */
Chris@10 122 ST(&(x[WS(rs, 6)]), VSUB(Ty, Tz), ms, &(x[0]));
Chris@10 123 ST(&(x[0]), VADD(Ty, Tz), ms, &(x[0]));
Chris@10 124 ST(&(x[WS(rs, 9)]), VFMAI(Tx, Tg), ms, &(x[WS(rs, 1)]));
Chris@10 125 ST(&(x[WS(rs, 3)]), VFNMSI(Tx, Tg), ms, &(x[WS(rs, 1)]));
Chris@10 126 TO = VADD(TC, TF);
Chris@10 127 TG = VSUB(TC, TF);
Chris@10 128 TX = VFNMS(LDK(KP866025403), TU, TT);
Chris@10 129 TV = VFMA(LDK(KP866025403), TU, TT);
Chris@10 130 }
/* Remaining stores: outputs 8, 4, 2, 10, 5, 7, 11 and 1. */
Chris@10 131 ST(&(x[WS(rs, 8)]), VFNMSI(TP, TO), ms, &(x[0]));
Chris@10 132 ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
Chris@10 133 ST(&(x[WS(rs, 2)]), VFMAI(TN, TG), ms, &(x[0]));
Chris@10 134 ST(&(x[WS(rs, 10)]), VFNMSI(TN, TG), ms, &(x[0]));
Chris@10 135 ST(&(x[WS(rs, 5)]), VFMAI(TX, TW), ms, &(x[WS(rs, 1)]));
Chris@10 136 ST(&(x[WS(rs, 7)]), VFNMSI(TX, TW), ms, &(x[WS(rs, 1)]));
Chris@10 137 ST(&(x[WS(rs, 11)]), VFNMSI(TV, TS), ms, &(x[WS(rs, 1)]));
Chris@10 138 ST(&(x[WS(rs, 1)]), VFMAI(TV, TS), ms, &(x[WS(rs, 1)]));
Chris@10 139 }
Chris@10 140 }
Chris@10 141 }
Chris@10 142 }
Chris@10 143 }
/* Invoked once after the loop; NOTE(review): presumably SIMD state cleanup, confirm in the SIMD header. */
Chris@10 144 VLEAVE();
Chris@10 145 }
Chris@10 146
/*
 * Twiddle instruction table for the HAVE_FMA build of t1bv_12: eleven VTW
 * entries whose second argument runs from 1 to 11, followed by a closing
 * {TW_NEXT, VL, 0} entry. The table is referenced from the ct_desc
 * initialiser below.
 * NOTE(review): VTW and TW_NEXT are defined in headers not shown here;
 * presumably each VTW(0, k) describes the twiddle applied to input k, which
 * would match the 11 BYTW calls in t1bv_12. Confirm against those headers.
 */
Chris@10 147 static const tw_instr twinstr[] = {
Chris@10 148 VTW(0, 1),
Chris@10 149 VTW(0, 2),
Chris@10 150 VTW(0, 3),
Chris@10 151 VTW(0, 4),
Chris@10 152 VTW(0, 5),
Chris@10 153 VTW(0, 6),
Chris@10 154 VTW(0, 7),
Chris@10 155 VTW(0, 8),
Chris@10 156 VTW(0, 9),
Chris@10 157 VTW(0, 10),
Chris@10 158 VTW(0, 11),
Chris@10 159 {TW_NEXT, VL, 0}
Chris@10 160 };
Chris@10 161
/*
 * Codelet descriptor for the HAVE_FMA build: the value 12, the name string
 * "t1bv_12", the twiddle instruction table, &GENUS, and the operation counts
 * {41, 24, 18, 0}. The first three counts equal the "41 additions,
 * 24 multiplications, 18 fused multiply/add" figures in the generator
 * comment for this variant.
 * NOTE(review): the layout of ct_desc is not visible here; the leading 12 is
 * presumably the transform size and the three trailing zeros are fields
 * whose meaning must be checked in the ct_desc declaration.
 */
Chris@10 162 static const ct_desc desc = { 12, XSIMD_STRING("t1bv_12"), twinstr, &GENUS, {41, 24, 18, 0}, 0, 0, 0 };
Chris@10 163
/*
 * Registration entry point (HAVE_FMA build): hands the t1bv_12 kernel and
 * its descriptor to planner p through X(kdft_dit_register).
 * XSIMD(...) and X(...) are name-decorating macros defined outside this file.
 */
Chris@10 164 void XSIMD(codelet_t1bv_12) (planner *p) {
Chris@10 165 X(kdft_dit_register) (p, t1bv_12, &desc);
Chris@10 166 }
Chris@10 167 #else /* HAVE_FMA */
Chris@10 168
Chris@10 169 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1bv_12 -include t1b.h -sign 1 */
Chris@10 170
Chris@10 171 /*
Chris@10 172 * This function contains 59 FP additions, 30 FP multiplications,
Chris@10 173 * (or, 55 additions, 26 multiplications, 4 fused multiply/add),
Chris@10 174 * 28 stack variables, 2 constants, and 24 memory accesses
Chris@10 175 */
Chris@10 176 #include "t1b.h"
Chris@10 177
/*
 * t1bv_12 (variant compiled when HAVE_FMA is not defined): size-12 twiddle
 * codelet; see the genfft command line in the comment above (-n 12,
 * -sign 1, -simd, without -fma).
 *
 * Each loop step loads the 12 inputs x[WS(rs, 0)] .. x[WS(rs, 11)] in four
 * groups of three (0/4/8, 1/5/9, 2/6/10, 3/7/11), passes inputs 1..11
 * through BYTW using the twiddle entry at W[TWVL * 2 * (k - 1)] for input k
 * (input 0 is used as loaded), and then writes 12 results back to the same
 * 12 locations in three groups of four stores. All loads of a step come
 * before its first store.
 *
 * Parameters:
 *   ri     - not referenced anywhere in this body.
 *   ii     - base pointer of the data; aliased as x and advanced by
 *            VL * ms after every step.
 *   W      - twiddle table; offset by mb * ((TWVL / VL) * 22) on entry and
 *            advanced by TWVL * 22 after every step.
 *   rs     - stride between the 12 points, applied through WS(rs, k).
 *   mb, me - half-open range [mb, me) of the loop counter m, stepped by VL.
 *   ms     - used to advance x between steps; also forwarded to LD and ST.
 *
 * NOTE(review): LD, ST, BYTW, VADD, VSUB, VMUL, VFNMS, VBYI, DVK, LDK,
 * MAKE_VOLATILE_STRIDE and VLEAVE are macros from headers that are not part
 * of this listing; their exact semantics should be confirmed there.
 */
Chris@10 178 static void t1bv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 179 {
/* The two constants used below; 0.866025403... is numerically sqrt(3)/2. */
Chris@10 180 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 181 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 182 {
Chris@10 183 INT m;
Chris@10 184 R *x;
/* Every load and store in this function goes through ii. */
Chris@10 185 x = ii;
/* One step per VL values of m; x and W are advanced in the loop header. */
Chris@10 186 for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
/* Per-group results that are consumed by the three output sections further down. */
Chris@10 187 V T1, Tt, T6, T7, TB, Tq, TC, TD, T9, Tu, Te, Tf, Tx, Tl, Ty;
Chris@10 188 V Tz;
/* Group of inputs 0, 8 and 4: difference Tt, sum T6, and T7 built from T1 and T6. */
Chris@10 189 {
Chris@10 190 V T5, T3, T4, T2;
Chris@10 191 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 192 T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 193 T5 = BYTW(&(W[TWVL * 14]), T4);
Chris@10 194 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 195 T3 = BYTW(&(W[TWVL * 6]), T2);
Chris@10 196 Tt = VSUB(T3, T5);
Chris@10 197 T6 = VADD(T3, T5);
Chris@10 198 T7 = VFNMS(LDK(KP500000000), T6, T1);
Chris@10 199 }
/* Group of inputs 1, 9 and 5: difference Tq, sum TC, and TD built from TB and TC. */
Chris@10 200 {
Chris@10 201 V Tn, Tp, Tm, TA, To;
Chris@10 202 Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 203 Tn = BYTW(&(W[0]), Tm);
Chris@10 204 TA = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@10 205 TB = BYTW(&(W[TWVL * 16]), TA);
Chris@10 206 To = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 207 Tp = BYTW(&(W[TWVL * 8]), To);
Chris@10 208 Tq = VSUB(Tn, Tp);
Chris@10 209 TC = VADD(Tn, Tp);
Chris@10 210 TD = VFNMS(LDK(KP500000000), TC, TB);
Chris@10 211 }
/* Group of inputs 6, 2 and 10: difference Tu, sum Te, and Tf built from T9 and Te. */
Chris@10 212 {
Chris@10 213 V Td, Tb, T8, Tc, Ta;
Chris@10 214 T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 215 T9 = BYTW(&(W[TWVL * 10]), T8);
Chris@10 216 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 217 Td = BYTW(&(W[TWVL * 2]), Tc);
Chris@10 218 Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@10 219 Tb = BYTW(&(W[TWVL * 18]), Ta);
Chris@10 220 Tu = VSUB(Tb, Td);
Chris@10 221 Te = VADD(Tb, Td);
Chris@10 222 Tf = VFNMS(LDK(KP500000000), Te, T9);
Chris@10 223 }
/* Group of inputs 7, 3 and 11: difference Tl, sum Ty, and Tz built from Tx and Ty. */
Chris@10 224 {
Chris@10 225 V Ti, Tk, Th, Tw, Tj;
Chris@10 226 Th = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 227 Ti = BYTW(&(W[TWVL * 12]), Th);
Chris@10 228 Tw = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 229 Tx = BYTW(&(W[TWVL * 4]), Tw);
Chris@10 230 Tj = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@10 231 Tk = BYTW(&(W[TWVL * 20]), Tj);
Chris@10 232 Tl = VSUB(Ti, Tk);
Chris@10 233 Ty = VADD(Ti, Tk);
Chris@10 234 Tz = VFNMS(LDK(KP500000000), Ty, Tx);
Chris@10 235 }
/* Outputs 11, 5, 1 and 7. Every input has been loaded by now. */
Chris@10 236 {
Chris@10 237 V Ts, TG, TF, TH;
Chris@10 238 {
Chris@10 239 V Tg, Tr, Tv, TE;
Chris@10 240 Tg = VSUB(T7, Tf);
Chris@10 241 Tr = VMUL(LDK(KP866025403), VSUB(Tl, Tq));
Chris@10 242 Ts = VSUB(Tg, Tr);
Chris@10 243 TG = VADD(Tg, Tr);
Chris@10 244 Tv = VMUL(LDK(KP866025403), VSUB(Tt, Tu));
Chris@10 245 TE = VSUB(Tz, TD);
Chris@10 246 TF = VBYI(VADD(Tv, TE));
Chris@10 247 TH = VBYI(VSUB(TE, Tv));
Chris@10 248 }
Chris@10 249 ST(&(x[WS(rs, 11)]), VSUB(Ts, TF), ms, &(x[WS(rs, 1)]));
Chris@10 250 ST(&(x[WS(rs, 5)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
Chris@10 251 ST(&(x[WS(rs, 1)]), VADD(Ts, TF), ms, &(x[WS(rs, 1)]));
Chris@10 252 ST(&(x[WS(rs, 7)]), VSUB(TG, TH), ms, &(x[WS(rs, 1)]));
Chris@10 253 }
/* Outputs 3, 0, 9 and 6, built from the plain sums of each input group. */
Chris@10 254 {
Chris@10 255 V TS, TW, TV, TX;
Chris@10 256 {
Chris@10 257 V TQ, TR, TT, TU;
Chris@10 258 TQ = VADD(T1, T6);
Chris@10 259 TR = VADD(T9, Te);
Chris@10 260 TS = VSUB(TQ, TR);
Chris@10 261 TW = VADD(TQ, TR);
Chris@10 262 TT = VADD(Tx, Ty);
Chris@10 263 TU = VADD(TB, TC);
Chris@10 264 TV = VBYI(VSUB(TT, TU));
Chris@10 265 TX = VADD(TT, TU);
Chris@10 266 }
Chris@10 267 ST(&(x[WS(rs, 3)]), VSUB(TS, TV), ms, &(x[WS(rs, 1)]));
Chris@10 268 ST(&(x[0]), VADD(TW, TX), ms, &(x[0]));
Chris@10 269 ST(&(x[WS(rs, 9)]), VADD(TS, TV), ms, &(x[WS(rs, 1)]));
Chris@10 270 ST(&(x[WS(rs, 6)]), VSUB(TW, TX), ms, &(x[0]));
Chris@10 271 }
/* Outputs 2, 8, 10 and 4. */
Chris@10 272 {
Chris@10 273 V TK, TO, TN, TP;
Chris@10 274 {
Chris@10 275 V TI, TJ, TL, TM;
Chris@10 276 TI = VADD(Tl, Tq);
Chris@10 277 TJ = VADD(Tt, Tu);
Chris@10 278 TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
Chris@10 279 TO = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
Chris@10 280 TL = VADD(T7, Tf);
Chris@10 281 TM = VADD(Tz, TD);
Chris@10 282 TN = VSUB(TL, TM);
Chris@10 283 TP = VADD(TL, TM);
Chris@10 284 }
Chris@10 285 ST(&(x[WS(rs, 2)]), VADD(TK, TN), ms, &(x[0]));
Chris@10 286 ST(&(x[WS(rs, 8)]), VSUB(TP, TO), ms, &(x[0]));
Chris@10 287 ST(&(x[WS(rs, 10)]), VSUB(TN, TK), ms, &(x[0]));
Chris@10 288 ST(&(x[WS(rs, 4)]), VADD(TO, TP), ms, &(x[0]));
Chris@10 289 }
Chris@10 290 }
Chris@10 291 }
/* Invoked once after the loop; NOTE(review): presumably SIMD state cleanup, confirm in the SIMD header. */
Chris@10 292 VLEAVE();
Chris@10 293 }
Chris@10 294
/*
 * Twiddle instruction table for the non-FMA build of t1bv_12: eleven VTW
 * entries whose second argument runs from 1 to 11, followed by a closing
 * {TW_NEXT, VL, 0} entry. The table is referenced from the ct_desc
 * initialiser below.
 * NOTE(review): VTW and TW_NEXT are defined in headers not shown here;
 * presumably each VTW(0, k) describes the twiddle applied to input k, which
 * would match the 11 BYTW calls in t1bv_12. Confirm against those headers.
 */
Chris@10 295 static const tw_instr twinstr[] = {
Chris@10 296 VTW(0, 1),
Chris@10 297 VTW(0, 2),
Chris@10 298 VTW(0, 3),
Chris@10 299 VTW(0, 4),
Chris@10 300 VTW(0, 5),
Chris@10 301 VTW(0, 6),
Chris@10 302 VTW(0, 7),
Chris@10 303 VTW(0, 8),
Chris@10 304 VTW(0, 9),
Chris@10 305 VTW(0, 10),
Chris@10 306 VTW(0, 11),
Chris@10 307 {TW_NEXT, VL, 0}
Chris@10 308 };
Chris@10 309
/*
 * Codelet descriptor for the non-FMA build: the value 12, the name string
 * "t1bv_12", the twiddle instruction table, &GENUS, and the operation counts
 * {55, 26, 4, 0}. The first three counts equal the "55 additions,
 * 26 multiplications, 4 fused multiply/add" figures in the generator
 * comment for this variant.
 * NOTE(review): the layout of ct_desc is not visible here; the leading 12 is
 * presumably the transform size and the three trailing zeros are fields
 * whose meaning must be checked in the ct_desc declaration.
 */
Chris@10 310 static const ct_desc desc = { 12, XSIMD_STRING("t1bv_12"), twinstr, &GENUS, {55, 26, 4, 0}, 0, 0, 0 };
Chris@10 311
/*
 * Registration entry point (non-FMA build): hands the t1bv_12 kernel and
 * its descriptor to planner p through X(kdft_dit_register).
 * XSIMD(...) and X(...) are name-decorating macros defined outside this file.
 */
Chris@10 312 void XSIMD(codelet_t1bv_12) (planner *p) {
Chris@10 313 X(kdft_dit_register) (p, t1bv_12, &desc);
Chris@10 314 }
Chris@10 315 #endif /* HAVE_FMA */