annotate src/fftw-3.3.3/dft/simd/common/t1fv_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:38:03 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1fv_12 -include t1f.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 59 FP additions, 42 FP multiplications,
Chris@10 32 * (or, 41 additions, 24 multiplications, 18 fused multiply/add),
Chris@10 33 * 41 stack variables, 2 constants, and 24 memory accesses
Chris@10 34 */
Chris@10 35 #include "t1f.h"
Chris@10 36
/*
 * t1fv_12 (HAVE_FMA variant): generated size-12 twiddle codelet working on
 * SIMD values of type V.
 *
 * ri     - base pointer of the data; each iteration loads the 12 elements
 *          x[WS(rs, 0)] .. x[WS(rs, 11)] and stores 12 results back to the
 *          same 12 locations.
 * ii     - not referenced anywhere in this body.
 * W      - twiddle table; offset by mb * ((TWVL / VL) * 22) before the first
 *          iteration and advanced by TWVL * 22 after each one.
 * rs     - stride handed to WS() to address the 12 elements.
 * mb, me - loop bounds: m starts at mb and runs while m < me, in steps of VL.
 * ms     - x advances by VL * ms per iteration; also passed to LD() and ST().
 *
 * NOTE(review): LD, ST, BYTWJ, VFMA, VFNMS, VFMAI, VFNMSI, DVK, LDK and
 * VLEAVE are macros defined outside this file; their exact semantics should
 * be confirmed against the headers pulled in via t1f.h.
 */
Chris@10 37 static void t1fv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
/* The two constants used below: 0.866025403... (numerically sqrt(3)/2) and 0.5. */
Chris@10 39 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 41 {
Chris@10 42 INT m;
Chris@10 43 R *x;
Chris@10 44 x = ri;
/* One pass per group of VL values of m; x and W are stepped in the loop header. */
Chris@10 45 for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
Chris@10 46 V Tq, Ti, T7, TQ, Tu, TA, TU, Tk, TR, Tf, TE, TM;
Chris@10 47 {
Chris@10 48 V T9, TC, Tj, TD, Te;
Chris@10 49 {
Chris@10 50 V T1, T4, T2, Tm, Tx, To;
/*
 * Input phase: the loads are interleaved with the first additions and
 * subtractions.  Element 0 is used as loaded; element k (k = 1..11) is passed
 * through BYTWJ together with &W[TWVL * 2 * (k - 1)].
 */
Chris@10 51 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 52 T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 53 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 54 Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 55 Tx = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@10 56 To = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 57 {
Chris@10 58 V T5, T3, Tn, Ty, Tp, Td, Tb, T8, Tc, Ta;
Chris@10 59 T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 60 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 61 Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@10 62 T5 = BYTWJ(&(W[TWVL * 14]), T4);
Chris@10 63 T3 = BYTWJ(&(W[TWVL * 6]), T2);
Chris@10 64 Tn = BYTWJ(&(W[0]), Tm);
Chris@10 65 Ty = BYTWJ(&(W[TWVL * 16]), Tx);
Chris@10 66 Tp = BYTWJ(&(W[TWVL * 8]), To);
Chris@10 67 T9 = BYTWJ(&(W[TWVL * 10]), T8);
Chris@10 68 Td = BYTWJ(&(W[TWVL * 2]), Tc);
Chris@10 69 Tb = BYTWJ(&(W[TWVL * 18]), Ta);
Chris@10 70 {
Chris@10 71 V Th, T6, Tt, Tz;
Chris@10 72 Th = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@10 73 TC = VSUB(T5, T3);
Chris@10 74 T6 = VADD(T3, T5);
Chris@10 75 Tt = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 76 Tz = VADD(Tn, Tp);
Chris@10 77 Tq = VSUB(Tn, Tp);
Chris@10 78 Tj = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 79 TD = VSUB(Td, Tb);
Chris@10 80 Te = VADD(Tb, Td);
Chris@10 81 Ti = BYTWJ(&(W[TWVL * 20]), Th);
Chris@10 82 T7 = VFNMS(LDK(KP500000000), T6, T1);
Chris@10 83 TQ = VADD(T1, T6);
Chris@10 84 Tu = BYTWJ(&(W[TWVL * 4]), Tt);
Chris@10 85 TA = VFNMS(LDK(KP500000000), Tz, Ty);
Chris@10 86 TU = VADD(Ty, Tz);
Chris@10 87 }
Chris@10 88 }
Chris@10 89 }
Chris@10 90 Tk = BYTWJ(&(W[TWVL * 12]), Tj);
Chris@10 91 TR = VADD(T9, Te);
Chris@10 92 Tf = VFNMS(LDK(KP500000000), Te, T9);
Chris@10 93 TE = VSUB(TC, TD);
Chris@10 94 TM = VADD(TC, TD);
Chris@10 95 }
Chris@10 96 {
Chris@10 97 V Tv, Tl, TI, Tg, TW, TS;
Chris@10 98 Tv = VADD(Tk, Ti);
Chris@10 99 Tl = VSUB(Ti, Tk);
Chris@10 100 TI = VADD(T7, Tf);
Chris@10 101 Tg = VSUB(T7, Tf);
Chris@10 102 TW = VADD(TQ, TR);
Chris@10 103 TS = VSUB(TQ, TR);
Chris@10 104 {
Chris@10 105 V TT, Tw, TL, Tr;
Chris@10 106 TT = VADD(Tu, Tv);
Chris@10 107 Tw = VFNMS(LDK(KP500000000), Tv, Tu);
Chris@10 108 TL = VSUB(Tl, Tq);
Chris@10 109 Tr = VADD(Tl, Tq);
Chris@10 110 {
Chris@10 111 V TP, TN, TG, Ts, TO, TK, TH, TF;
Chris@10 112 {
Chris@10 113 V TX, TV, TJ, TB;
Chris@10 114 TX = VADD(TT, TU);
Chris@10 115 TV = VSUB(TT, TU);
Chris@10 116 TJ = VADD(Tw, TA);
Chris@10 117 TB = VSUB(Tw, TA);
Chris@10 118 TP = VMUL(LDK(KP866025403), VADD(TM, TL));
Chris@10 119 TN = VMUL(LDK(KP866025403), VSUB(TL, TM));
Chris@10 120 TG = VFNMS(LDK(KP866025403), Tr, Tg);
Chris@10 121 Ts = VFMA(LDK(KP866025403), Tr, Tg);
/* First four stores: elements 6, 0, 3 and 9. */
Chris@10 122 ST(&(x[WS(rs, 6)]), VSUB(TW, TX), ms, &(x[0]));
Chris@10 123 ST(&(x[0]), VADD(TW, TX), ms, &(x[0]));
Chris@10 124 ST(&(x[WS(rs, 3)]), VFMAI(TV, TS), ms, &(x[WS(rs, 1)]));
Chris@10 125 ST(&(x[WS(rs, 9)]), VFNMSI(TV, TS), ms, &(x[WS(rs, 1)]));
Chris@10 126 TO = VADD(TI, TJ);
Chris@10 127 TK = VSUB(TI, TJ);
Chris@10 128 TH = VFMA(LDK(KP866025403), TE, TB);
Chris@10 129 TF = VFNMS(LDK(KP866025403), TE, TB);
Chris@10 130 }
/* Remaining eight stores: elements 4, 8, 10, 2, 5, 7, 11 and 1. */
Chris@10 131 ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
Chris@10 132 ST(&(x[WS(rs, 8)]), VFNMSI(TP, TO), ms, &(x[0]));
Chris@10 133 ST(&(x[WS(rs, 10)]), VFNMSI(TN, TK), ms, &(x[0]));
Chris@10 134 ST(&(x[WS(rs, 2)]), VFMAI(TN, TK), ms, &(x[0]));
Chris@10 135 ST(&(x[WS(rs, 5)]), VFNMSI(TH, TG), ms, &(x[WS(rs, 1)]));
Chris@10 136 ST(&(x[WS(rs, 7)]), VFMAI(TH, TG), ms, &(x[WS(rs, 1)]));
Chris@10 137 ST(&(x[WS(rs, 11)]), VFMAI(TF, Ts), ms, &(x[WS(rs, 1)]));
Chris@10 138 ST(&(x[WS(rs, 1)]), VFNMSI(TF, Ts), ms, &(x[WS(rs, 1)]));
Chris@10 139 }
Chris@10 140 }
Chris@10 141 }
Chris@10 142 }
Chris@10 143 }
/* Invoked once after the loop. NOTE(review): presumably a SIMD epilogue hook; confirm in the SIMD headers. */
Chris@10 144 VLEAVE();
Chris@10 145 }
Chris@10 146
/*
 * Twiddle instruction table for the HAVE_FMA variant of t1fv_12: one
 * VTW(0, k) entry for each k = 1..11, followed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably these correspond to the 11 twiddled inputs
 * (elements 1..11) of the size-12 codelet; confirm against the tw_instr
 * and VTW definitions.
 */
Chris@10 147 static const tw_instr twinstr[] = {
Chris@10 148 VTW(0, 1),
Chris@10 149 VTW(0, 2),
Chris@10 150 VTW(0, 3),
Chris@10 151 VTW(0, 4),
Chris@10 152 VTW(0, 5),
Chris@10 153 VTW(0, 6),
Chris@10 154 VTW(0, 7),
Chris@10 155 VTW(0, 8),
Chris@10 156 VTW(0, 9),
Chris@10 157 VTW(0, 10),
Chris@10 158 VTW(0, 11),
Chris@10 159 {TW_NEXT, VL, 0}
Chris@10 160 };
Chris@10 161
/*
 * Codelet descriptor for the HAVE_FMA variant: 12, the name string
 * "t1fv_12", the twinstr table and &GENUS, then {41, 24, 18, 0}. The first
 * three of those numbers match the "41 additions, 24 multiplications, 18
 * fused multiply/add" counts in the generator's comment for this variant.
 * NOTE(review): the meaning of the three trailing zeros is not visible here;
 * check the ct_desc definition.
 */
Chris@10 162 static const ct_desc desc = { 12, XSIMD_STRING("t1fv_12"), twinstr, &GENUS, {41, 24, 18, 0}, 0, 0, 0 };
Chris@10 163
/*
 * Registration entry point (HAVE_FMA variant): passes the planner p, the
 * t1fv_12 function and its descriptor to X(kdft_dit_register).
 */
Chris@10 164 void XSIMD(codelet_t1fv_12) (planner *p) {
Chris@10 165 X(kdft_dit_register) (p, t1fv_12, &desc);
Chris@10 166 }
Chris@10 167 #else /* HAVE_FMA */
Chris@10 168
Chris@10 169 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1fv_12 -include t1f.h */
Chris@10 170
Chris@10 171 /*
Chris@10 172 * This function contains 59 FP additions, 30 FP multiplications,
Chris@10 173 * (or, 55 additions, 26 multiplications, 4 fused multiply/add),
Chris@10 174 * 28 stack variables, 2 constants, and 24 memory accesses
Chris@10 175 */
Chris@10 176 #include "t1f.h"
Chris@10 177
/*
 * t1fv_12 (variant compiled when HAVE_FMA is not defined): generated size-12
 * twiddle codelet working on SIMD values of type V.
 *
 * ri     - base pointer of the data; each iteration loads the 12 elements
 *          x[WS(rs, 0)] .. x[WS(rs, 11)] and stores 12 results back to the
 *          same 12 locations.
 * ii     - not referenced anywhere in this body.
 * W      - twiddle table; offset by mb * ((TWVL / VL) * 22) before the first
 *          iteration and advanced by TWVL * 22 after each one.
 * rs     - stride handed to WS() to address the 12 elements.
 * mb, me - loop bounds: m starts at mb and runs while m < me, in steps of VL.
 * ms     - x advances by VL * ms per iteration; also passed to LD() and ST().
 *
 * NOTE(review): LD, ST, BYTWJ, VFNMS, VBYI, DVK, LDK and VLEAVE are macros
 * defined outside this file; their exact semantics should be confirmed
 * against the headers pulled in via t1f.h.
 */
Chris@10 178 static void t1fv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 179 {
/* The two constants used below: 0.866025403... (numerically sqrt(3)/2) and 0.5. */
Chris@10 180 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 181 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 182 {
Chris@10 183 INT m;
Chris@10 184 R *x;
Chris@10 185 x = ri;
/* One pass per group of VL values of m; x and W are stepped in the loop header. */
Chris@10 186 for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
Chris@10 187 V T1, TH, T6, TA, Tq, TE, Tv, TL, T9, TI, Te, TB, Ti, TD, Tn;
Chris@10 188 V TK;
/*
 * Input phase, four groups of three elements.  Element 0 is used as loaded;
 * element k (k = 1..11) is passed through BYTWJ with &W[TWVL * 2 * (k - 1)].
 * Each group produces a difference, a sum and a VFNMS combination with 0.5.
 */
/* Group 1: elements 0, 8 and 4. */
Chris@10 189 {
Chris@10 190 V T5, T3, T4, T2;
Chris@10 191 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 192 T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 193 T5 = BYTWJ(&(W[TWVL * 14]), T4);
Chris@10 194 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 195 T3 = BYTWJ(&(W[TWVL * 6]), T2);
Chris@10 196 TH = VSUB(T5, T3);
Chris@10 197 T6 = VADD(T3, T5);
Chris@10 198 TA = VFNMS(LDK(KP500000000), T6, T1);
Chris@10 199 }
/* Group 2: elements 9, 5 and 1. */
Chris@10 200 {
Chris@10 201 V Tu, Ts, Tp, Tt, Tr;
Chris@10 202 Tp = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@10 203 Tq = BYTWJ(&(W[TWVL * 16]), Tp);
Chris@10 204 Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 205 Tu = BYTWJ(&(W[TWVL * 8]), Tt);
Chris@10 206 Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 207 Ts = BYTWJ(&(W[0]), Tr);
Chris@10 208 TE = VSUB(Tu, Ts);
Chris@10 209 Tv = VADD(Ts, Tu);
Chris@10 210 TL = VFNMS(LDK(KP500000000), Tv, Tq);
Chris@10 211 }
/* Group 3: elements 6, 2 and 10. */
Chris@10 212 {
Chris@10 213 V Td, Tb, T8, Tc, Ta;
Chris@10 214 T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 215 T9 = BYTWJ(&(W[TWVL * 10]), T8);
Chris@10 216 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 217 Td = BYTWJ(&(W[TWVL * 2]), Tc);
Chris@10 218 Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@10 219 Tb = BYTWJ(&(W[TWVL * 18]), Ta);
Chris@10 220 TI = VSUB(Td, Tb);
Chris@10 221 Te = VADD(Tb, Td);
Chris@10 222 TB = VFNMS(LDK(KP500000000), Te, T9);
Chris@10 223 }
/* Group 4: elements 3, 11 and 7. */
Chris@10 224 {
Chris@10 225 V Tm, Tk, Th, Tl, Tj;
Chris@10 226 Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 227 Ti = BYTWJ(&(W[TWVL * 4]), Th);
Chris@10 228 Tl = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@10 229 Tm = BYTWJ(&(W[TWVL * 20]), Tl);
Chris@10 230 Tj = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 231 Tk = BYTWJ(&(W[TWVL * 12]), Tj);
Chris@10 232 TD = VSUB(Tm, Tk);
Chris@10 233 Tn = VADD(Tk, Tm);
Chris@10 234 TK = VFNMS(LDK(KP500000000), Tn, Ti);
Chris@10 235 }
/* Output group 1: combines the four group sums and stores elements 9, 0, 3 and 6. */
Chris@10 236 {
Chris@10 237 V Tg, Ty, Tx, Tz;
Chris@10 238 {
Chris@10 239 V T7, Tf, To, Tw;
Chris@10 240 T7 = VADD(T1, T6);
Chris@10 241 Tf = VADD(T9, Te);
Chris@10 242 Tg = VSUB(T7, Tf);
Chris@10 243 Ty = VADD(T7, Tf);
Chris@10 244 To = VADD(Ti, Tn);
Chris@10 245 Tw = VADD(Tq, Tv);
Chris@10 246 Tx = VBYI(VSUB(To, Tw));
Chris@10 247 Tz = VADD(To, Tw);
Chris@10 248 }
Chris@10 249 ST(&(x[WS(rs, 9)]), VSUB(Tg, Tx), ms, &(x[WS(rs, 1)]));
Chris@10 250 ST(&(x[0]), VADD(Ty, Tz), ms, &(x[0]));
Chris@10 251 ST(&(x[WS(rs, 3)]), VADD(Tg, Tx), ms, &(x[WS(rs, 1)]));
Chris@10 252 ST(&(x[WS(rs, 6)]), VSUB(Ty, Tz), ms, &(x[0]));
Chris@10 253 }
/* Output group 2: stores elements 10, 4, 2 and 8. */
Chris@10 254 {
Chris@10 255 V TS, TW, TV, TX;
Chris@10 256 {
Chris@10 257 V TQ, TR, TT, TU;
Chris@10 258 TQ = VADD(TA, TB);
Chris@10 259 TR = VADD(TK, TL);
Chris@10 260 TS = VSUB(TQ, TR);
Chris@10 261 TW = VADD(TQ, TR);
Chris@10 262 TT = VADD(TD, TE);
Chris@10 263 TU = VADD(TH, TI);
Chris@10 264 TV = VBYI(VMUL(LDK(KP866025403), VSUB(TT, TU)));
Chris@10 265 TX = VBYI(VMUL(LDK(KP866025403), VADD(TU, TT)));
Chris@10 266 }
Chris@10 267 ST(&(x[WS(rs, 10)]), VSUB(TS, TV), ms, &(x[0]));
Chris@10 268 ST(&(x[WS(rs, 4)]), VADD(TW, TX), ms, &(x[0]));
Chris@10 269 ST(&(x[WS(rs, 2)]), VADD(TS, TV), ms, &(x[0]));
Chris@10 270 ST(&(x[WS(rs, 8)]), VSUB(TW, TX), ms, &(x[0]));
Chris@10 271 }
/* Output group 3: stores elements 5, 11, 7 and 1. */
Chris@10 272 {
Chris@10 273 V TG, TP, TN, TO;
Chris@10 274 {
Chris@10 275 V TC, TF, TJ, TM;
Chris@10 276 TC = VSUB(TA, TB);
Chris@10 277 TF = VMUL(LDK(KP866025403), VSUB(TD, TE));
Chris@10 278 TG = VSUB(TC, TF);
Chris@10 279 TP = VADD(TC, TF);
Chris@10 280 TJ = VMUL(LDK(KP866025403), VSUB(TH, TI));
Chris@10 281 TM = VSUB(TK, TL);
Chris@10 282 TN = VBYI(VADD(TJ, TM));
Chris@10 283 TO = VBYI(VSUB(TJ, TM));
Chris@10 284 }
Chris@10 285 ST(&(x[WS(rs, 5)]), VSUB(TG, TN), ms, &(x[WS(rs, 1)]));
Chris@10 286 ST(&(x[WS(rs, 11)]), VSUB(TP, TO), ms, &(x[WS(rs, 1)]));
Chris@10 287 ST(&(x[WS(rs, 7)]), VADD(TN, TG), ms, &(x[WS(rs, 1)]));
Chris@10 288 ST(&(x[WS(rs, 1)]), VADD(TO, TP), ms, &(x[WS(rs, 1)]));
Chris@10 289 }
Chris@10 290 }
Chris@10 291 }
/* Invoked once after the loop. NOTE(review): presumably a SIMD epilogue hook; confirm in the SIMD headers. */
Chris@10 292 VLEAVE();
Chris@10 293 }
Chris@10 294
/*
 * Twiddle instruction table for the non-FMA variant of t1fv_12: one
 * VTW(0, k) entry for each k = 1..11, followed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably these correspond to the 11 twiddled inputs
 * (elements 1..11) of the size-12 codelet; confirm against the tw_instr
 * and VTW definitions.
 */
Chris@10 295 static const tw_instr twinstr[] = {
Chris@10 296 VTW(0, 1),
Chris@10 297 VTW(0, 2),
Chris@10 298 VTW(0, 3),
Chris@10 299 VTW(0, 4),
Chris@10 300 VTW(0, 5),
Chris@10 301 VTW(0, 6),
Chris@10 302 VTW(0, 7),
Chris@10 303 VTW(0, 8),
Chris@10 304 VTW(0, 9),
Chris@10 305 VTW(0, 10),
Chris@10 306 VTW(0, 11),
Chris@10 307 {TW_NEXT, VL, 0}
Chris@10 308 };
Chris@10 309
/*
 * Codelet descriptor for the non-FMA variant: 12, the name string
 * "t1fv_12", the twinstr table and &GENUS, then {55, 26, 4, 0}. The first
 * three of those numbers match the "55 additions, 26 multiplications, 4
 * fused multiply/add" counts in the generator's comment for this variant.
 * NOTE(review): the meaning of the three trailing zeros is not visible here;
 * check the ct_desc definition.
 */
Chris@10 310 static const ct_desc desc = { 12, XSIMD_STRING("t1fv_12"), twinstr, &GENUS, {55, 26, 4, 0}, 0, 0, 0 };
Chris@10 311
/*
 * Registration entry point (non-FMA variant): passes the planner p, the
 * t1fv_12 function and its descriptor to X(kdft_dit_register).
 */
Chris@10 312 void XSIMD(codelet_t1fv_12) (planner *p) {
Chris@10 313 X(kdft_dit_register) (p, t1fv_12, &desc);
Chris@10 314 }
Chris@10 315 #endif /* HAVE_FMA */