annotate src/fftw-3.3.8/dft/simd/common/t1bv_9.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:58 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1bv_9 -include dft/simd/t1b.h -sign 1 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 54 FP additions, 54 FP multiplications,
Chris@82 32 * (or, 20 additions, 20 multiplications, 34 fused multiply/add),
Chris@82 33 * 50 stack variables, 19 constants, and 18 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t1b.h"
Chris@82 36
/*
 * t1bv_9, FMA variant: per the generator command line above, this is a
 * genfft-produced SIMD twiddle codelet for n = 9, generated with -sign 1.
 *
 * Parameters, as the body uses them:
 *   ri     - never referenced in this function.
 *   ii     - base pointer; every LD and ST goes through x, which starts at ii.
 *   W      - twiddle table; offset once by mb * ((TWVL / VL) * 16) and then
 *            advanced by TWVL * 16 on every iteration.
 *   rs     - stride handed to WS(rs, k) to address the nine points k = 0..8.
 *   mb, me - loop bounds: m starts at mb and runs while m < me, stepping by VL.
 *   ms     - x advances by VL * ms per iteration; also passed to LD and ST.
 *
 * Each iteration loads nine inputs, passes inputs 1..8 through BYTW with an
 * entry of W (input 0 is used as loaded), and stores nine results back to the
 * same nine locations, i.e. the data is transformed in place.
 *
 * NOTE(review): ri is presumably kept only so the signature matches what
 * X(kdft_dit_register) expects - confirm against dft/codelet-dft.h.
 */
Chris@82 37 static void t1bv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* The 19 numeric constants used below; each name spells the leading digits of its value. */
Chris@82 39 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@82 40 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@82 41 DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
Chris@82 42 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@82 43 DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
Chris@82 44 DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
Chris@82 45 DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
Chris@82 46 DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
Chris@82 47 DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
Chris@82 48 DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
Chris@82 49 DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
Chris@82 50 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 51 DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
Chris@82 52 DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
Chris@82 53 DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
Chris@82 54 DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
Chris@82 55 DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
Chris@82 56 DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
Chris@82 57 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 58 {
Chris@82 59 INT m;
Chris@82 60 R *x;
/* All loads and stores below go through x, so only ii is ever dereferenced. */
Chris@82 61 x = ii;
/* One pass per VL values of m; x and W are stepped in the loop's increment clause. */
Chris@82 62 for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
Chris@82 63 V T1, T6, Tx, TO, TP, Tf, Tp, Tk, Tl, Tq, Tu, TD, TC, TA, Tz;
/* Point 0 is loaded without a BYTW step. */
Chris@82 64 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 65 {
/* Points 3 and 6 after BYTW: T6 is their sum, Tx their difference. */
Chris@82 66 V T3, T5, T2, T4;
Chris@82 67 T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 68 T3 = BYTW(&(W[TWVL * 4]), T2);
Chris@82 69 T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 70 T5 = BYTW(&(W[TWVL * 10]), T4);
Chris@82 71 T6 = VADD(T3, T5);
Chris@82 72 Tx = VSUB(T3, T5);
Chris@82 73 }
Chris@82 74 {
/* Remaining six points: T9 = point 2, Tn = point 1, Tb/Td = points 5/8, Th/Tj = points 4/7 (all after BYTW). */
Chris@82 75 V T9, Tn, Tb, Td, Te, Th, Tj, To, T8, Tm;
Chris@82 76 T8 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 77 T9 = BYTW(&(W[TWVL * 2]), T8);
Chris@82 78 Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 79 Tn = BYTW(&(W[0]), Tm);
Chris@82 80 {
Chris@82 81 V Ta, Tc, Tg, Ti;
Chris@82 82 Ta = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 83 Tb = BYTW(&(W[TWVL * 8]), Ta);
Chris@82 84 Tc = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 85 Td = BYTW(&(W[TWVL * 14]), Tc);
Chris@82 86 Te = VADD(Tb, Td);
Chris@82 87 Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 88 Th = BYTW(&(W[TWVL * 6]), Tg);
Chris@82 89 Ti = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 90 Tj = BYTW(&(W[TWVL * 12]), Ti);
Chris@82 91 To = VADD(Th, Tj);
Chris@82 92 }
/* TO sums points 1, 4, 7 and TP sums points 2, 5, 8; the rest are intermediates for outputs 1, 2, 4, 5, 7, 8. */
Chris@82 93 TO = VADD(Tn, To);
Chris@82 94 TP = VADD(T9, Te);
Chris@82 95 Tf = VFNMS(LDK(KP500000000), Te, T9);
Chris@82 96 Tp = VFNMS(LDK(KP500000000), To, Tn);
Chris@82 97 Tk = VSUB(Th, Tj);
Chris@82 98 Tl = VSUB(Td, Tb);
Chris@82 99 Tq = VFNMS(LDK(KP586256827), Tp, Tl);
Chris@82 100 Tu = VFNMS(LDK(KP439692620), Tk, Tf);
Chris@82 101 TD = VFNMS(LDK(KP726681596), Tk, Tp);
Chris@82 102 TC = VFMA(LDK(KP203604859), Tf, Tl);
Chris@82 103 TA = VFMA(LDK(KP968908795), Tp, Tk);
Chris@82 104 Tz = VFNMS(LDK(KP152703644), Tl, Tf);
Chris@82 105 }
Chris@82 106 {
/* Outputs 0, 3 and 6, built from TN (points 0 + 3 + 6), TO and TP. */
Chris@82 107 V TS, TN, TQ, TR;
Chris@82 108 TS = VMUL(LDK(KP866025403), VSUB(TO, TP));
Chris@82 109 TN = VADD(T1, T6);
Chris@82 110 TQ = VADD(TO, TP);
Chris@82 111 TR = VFNMS(LDK(KP500000000), TQ, TN);
Chris@82 112 ST(&(x[WS(rs, 3)]), VFMAI(TS, TR), ms, &(x[WS(rs, 1)]));
Chris@82 113 ST(&(x[0]), VADD(TQ, TN), ms, &(x[0]));
Chris@82 114 ST(&(x[WS(rs, 6)]), VFNMSI(TS, TR), ms, &(x[0]));
Chris@82 115 }
Chris@82 116 {
/* Remaining six outputs, stored in pairs: (1, 8), then (7, 2), then (4, 5). */
Chris@82 117 V Ts, Tw, TJ, TM, T7, TF, TL, Tr, Tv;
Chris@82 118 Tr = VFNMS(LDK(KP347296355), Tq, Tk);
Chris@82 119 Ts = VFNMS(LDK(KP907603734), Tr, Tf);
Chris@82 120 Tv = VFNMS(LDK(KP420276625), Tu, Tl);
Chris@82 121 Tw = VFNMS(LDK(KP826351822), Tv, Tp);
Chris@82 122 {
Chris@82 123 V TH, TI, TE, TB;
Chris@82 124 TH = VFNMS(LDK(KP898197570), TD, TC);
Chris@82 125 TI = VFMA(LDK(KP673648177), TA, Tz);
Chris@82 126 TJ = VFMA(LDK(KP666666666), TI, TH);
Chris@82 127 TM = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tx, TI));
Chris@82 128 T7 = VFNMS(LDK(KP500000000), T6, T1);
Chris@82 129 TE = VFMA(LDK(KP898197570), TD, TC);
Chris@82 130 TB = VFNMS(LDK(KP673648177), TA, Tz);
Chris@82 131 TF = VFNMS(LDK(KP500000000), TE, TB);
Chris@82 132 TL = VFMA(LDK(KP852868531), TE, T7);
Chris@82 133 }
Chris@82 134 ST(&(x[WS(rs, 1)]), VFMAI(TM, TL), ms, &(x[WS(rs, 1)]));
Chris@82 135 ST(&(x[WS(rs, 8)]), VFNMSI(TM, TL), ms, &(x[0]));
Chris@82 136 {
Chris@82 137 V Tt, Ty, TG, TK;
Chris@82 138 Tt = VFNMS(LDK(KP939692620), Ts, T7);
Chris@82 139 Ty = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tx, Tw));
Chris@82 140 ST(&(x[WS(rs, 7)]), VFNMSI(Ty, Tt), ms, &(x[WS(rs, 1)]));
Chris@82 141 ST(&(x[WS(rs, 2)]), VFMAI(Ty, Tt), ms, &(x[0]));
Chris@82 142 TG = VFMA(LDK(KP852868531), TF, T7);
Chris@82 143 TK = VMUL(LDK(KP866025403), VFNMS(LDK(KP852868531), TJ, Tx));
Chris@82 144 ST(&(x[WS(rs, 4)]), VFMAI(TK, TG), ms, &(x[0]));
Chris@82 145 ST(&(x[WS(rs, 5)]), VFNMSI(TK, TG), ms, &(x[WS(rs, 1)]));
Chris@82 146 }
Chris@82 147 }
Chris@82 148 }
Chris@82 149 }
Chris@82 150 VLEAVE();
Chris@82 151 }
Chris@82 152
/*
 * Twiddle instruction list referenced by desc: one VTW(0, k) entry for each
 * k = 1..8, closed by a TW_NEXT entry. The eight VTW entries line up with the
 * eight BYTW calls in t1bv_9 (inputs 1..8).
 * NOTE(review): the exact meaning of the VTW arguments and of {TW_NEXT, VL, 0}
 * is defined outside this file - confirm in the FFTW twiddle headers.
 */
Chris@82 153 static const tw_instr twinstr[] = {
Chris@82 154 VTW(0, 1),
Chris@82 155 VTW(0, 2),
Chris@82 156 VTW(0, 3),
Chris@82 157 VTW(0, 4),
Chris@82 158 VTW(0, 5),
Chris@82 159 VTW(0, 6),
Chris@82 160 VTW(0, 7),
Chris@82 161 VTW(0, 8),
Chris@82 162 {TW_NEXT, VL, 0}
Chris@82 163 };
Chris@82 164
/*
 * Descriptor passed to the planner for the FMA variant: 9 (the transform size
 * from the generator's -n 9), the codelet name string, the twiddle list, the
 * genus, and {20, 20, 34, 0}, which matches the "20 additions,
 * 20 multiplications, 34 fused multiply/add" tally in the comment that
 * precedes t1bv_9 in this #if branch.
 * NOTE(review): the role of the three trailing zeros is set by the ct_desc
 * definition, which is not visible here.
 */
Chris@82 165 static const ct_desc desc = { 9, XSIMD_STRING("t1bv_9"), twinstr, &GENUS, {20, 20, 34, 0}, 0, 0, 0 };
Chris@82 166
/*
 * Registration hook for the FMA variant: hands t1bv_9 and its descriptor to
 * the planner p through X(kdft_dit_register). This is the only non-static
 * symbol defined in this branch.
 */
Chris@82 167 void XSIMD(codelet_t1bv_9) (planner *p) {
Chris@82 168 X(kdft_dit_register) (p, t1bv_9, &desc);
Chris@82 169 }
Chris@82 170 #else
Chris@82 171
Chris@82 172 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1bv_9 -include dft/simd/t1b.h -sign 1 */
Chris@82 173
Chris@82 174 /*
Chris@82 175 * This function contains 54 FP additions, 42 FP multiplications,
Chris@82 176 * (or, 38 additions, 26 multiplications, 16 fused multiply/add),
Chris@82 177 * 38 stack variables, 14 constants, and 18 memory accesses
Chris@82 178 */
Chris@82 179 #include "dft/simd/t1b.h"
Chris@82 180
/*
 * t1bv_9, non-FMA variant (compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): per the generator command line above,
 * a genfft-produced SIMD twiddle codelet for n = 9, generated with -sign 1 and
 * without the -fma flag.
 *
 * Parameters, as the body uses them:
 *   ri     - never referenced in this function.
 *   ii     - base pointer; every LD and ST goes through x, which starts at ii.
 *   W      - twiddle table; offset once by mb * ((TWVL / VL) * 16) and then
 *            advanced by TWVL * 16 on every iteration.
 *   rs     - stride handed to WS(rs, k) to address the nine points k = 0..8.
 *   mb, me - loop bounds: m starts at mb and runs while m < me, stepping by VL.
 *   ms     - x advances by VL * ms per iteration; also passed to LD and ST.
 *
 * Each iteration loads nine inputs, passes inputs 1..8 through BYTW with an
 * entry of W (input 0 is used as loaded), and stores nine results back to the
 * same nine locations, i.e. the data is transformed in place.
 *
 * NOTE(review): ri is presumably kept only so the signature matches what
 * X(kdft_dit_register) expects - confirm against dft/codelet-dft.h.
 */
Chris@82 181 static void t1bv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 182 {
/* The 14 numeric constants used below; each name spells the leading digits of its value. */
Chris@82 183 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@82 184 DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
Chris@82 185 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@82 186 DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
Chris@82 187 DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
Chris@82 188 DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
Chris@82 189 DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
Chris@82 190 DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
Chris@82 191 DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
Chris@82 192 DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
Chris@82 193 DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
Chris@82 194 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@82 195 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 196 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 197 {
Chris@82 198 INT m;
Chris@82 199 R *x;
/* All loads and stores below go through x, so only ii is ever dereferenced. */
Chris@82 200 x = ii;
/* One pass per VL values of m; x and W are stepped in the loop's increment clause. */
Chris@82 201 for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
Chris@82 202 V T1, T6, Tu, Tg, Tf, TD, Tq, Tp, TE;
/* Point 0 is loaded without a BYTW step. */
Chris@82 203 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 204 {
/* Points 3 and 6 after BYTW: T6 is their sum, Tu their difference scaled by KP866025403. */
Chris@82 205 V T3, T5, T2, T4;
Chris@82 206 T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 207 T3 = BYTW(&(W[TWVL * 4]), T2);
Chris@82 208 T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 209 T5 = BYTW(&(W[TWVL * 10]), T4);
Chris@82 210 T6 = VADD(T3, T5);
Chris@82 211 Tu = VMUL(LDK(KP866025403), VSUB(T3, T5));
Chris@82 212 }
Chris@82 213 {
/* Points 1, 4, 7 after BYTW: TD is their sum, Tg is point 4 minus point 7, Tf combines point 1 with half of (4 + 7). */
Chris@82 214 V T9, Td, Tb, T8, Tc, Ta, Te;
Chris@82 215 T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 216 T9 = BYTW(&(W[0]), T8);
Chris@82 217 Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 218 Td = BYTW(&(W[TWVL * 12]), Tc);
Chris@82 219 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 220 Tb = BYTW(&(W[TWVL * 6]), Ta);
Chris@82 221 Tg = VSUB(Tb, Td);
Chris@82 222 Te = VADD(Tb, Td);
Chris@82 223 Tf = VFNMS(LDK(KP500000000), Te, T9);
Chris@82 224 TD = VADD(T9, Te);
Chris@82 225 }
Chris@82 226 {
/* Points 2, 5, 8 after BYTW, handled the same way: TE is their sum, Tq is point 5 minus point 8. */
Chris@82 227 V Tj, Tn, Tl, Ti, Tm, Tk, To;
Chris@82 228 Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 229 Tj = BYTW(&(W[TWVL * 2]), Ti);
Chris@82 230 Tm = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 231 Tn = BYTW(&(W[TWVL * 14]), Tm);
Chris@82 232 Tk = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 233 Tl = BYTW(&(W[TWVL * 8]), Tk);
Chris@82 234 Tq = VSUB(Tl, Tn);
Chris@82 235 To = VADD(Tl, Tn);
Chris@82 236 Tp = VFNMS(LDK(KP500000000), To, Tj);
Chris@82 237 TE = VADD(Tj, To);
Chris@82 238 }
Chris@82 239 {
/* Outputs 0, 3 and 6, built from TG (points 0 + 3 + 6), TD and TE. */
/* NOTE(review): VBYI presumably multiplies by the imaginary unit - confirm in the SIMD support header. */
Chris@82 240 V TF, TG, TH, TI;
Chris@82 241 TF = VBYI(VMUL(LDK(KP866025403), VSUB(TD, TE)));
Chris@82 242 TG = VADD(T1, T6);
Chris@82 243 TH = VADD(TD, TE);
Chris@82 244 TI = VFNMS(LDK(KP500000000), TH, TG);
Chris@82 245 ST(&(x[WS(rs, 3)]), VADD(TF, TI), ms, &(x[WS(rs, 1)]));
Chris@82 246 ST(&(x[0]), VADD(TG, TH), ms, &(x[0]));
Chris@82 247 ST(&(x[WS(rs, 6)]), VSUB(TI, TF), ms, &(x[0]));
Chris@82 248 }
Chris@82 249 {
/* Remaining six outputs, stored in pairs: (7, 2), then (8, 1), then (4, 5). */
Chris@82 250 V TC, Tv, Tw, Tx, Th, Tr, Ts, T7, TB;
Chris@82 251 TC = VBYI(VSUB(VFMA(LDK(KP984807753), Tf, VFMA(LDK(KP813797681), Tq, VFNMS(LDK(KP150383733), Tg, VMUL(LDK(KP342020143), Tp)))), Tu));
Chris@82 252 Tv = VFMA(LDK(KP663413948), Tg, VMUL(LDK(KP642787609), Tf));
Chris@82 253 Tw = VFMA(LDK(KP150383733), Tq, VMUL(LDK(KP984807753), Tp));
Chris@82 254 Tx = VADD(Tv, Tw);
Chris@82 255 Th = VFNMS(LDK(KP556670399), Tg, VMUL(LDK(KP766044443), Tf));
Chris@82 256 Tr = VFNMS(LDK(KP852868531), Tq, VMUL(LDK(KP173648177), Tp));
Chris@82 257 Ts = VADD(Th, Tr);
Chris@82 258 T7 = VFNMS(LDK(KP500000000), T6, T1);
Chris@82 259 TB = VFMA(LDK(KP852868531), Tg, VFMA(LDK(KP173648177), Tf, VFMA(LDK(KP296198132), Tq, VFNMS(LDK(KP939692620), Tp, T7))));
Chris@82 260 ST(&(x[WS(rs, 7)]), VSUB(TB, TC), ms, &(x[WS(rs, 1)]));
Chris@82 261 ST(&(x[WS(rs, 2)]), VADD(TB, TC), ms, &(x[0]));
Chris@82 262 {
Chris@82 263 V Tt, Ty, Tz, TA;
Chris@82 264 Tt = VADD(T7, Ts);
Chris@82 265 Ty = VBYI(VADD(Tu, Tx));
Chris@82 266 ST(&(x[WS(rs, 8)]), VSUB(Tt, Ty), ms, &(x[0]));
Chris@82 267 ST(&(x[WS(rs, 1)]), VADD(Tt, Ty), ms, &(x[WS(rs, 1)]));
Chris@82 268 Tz = VBYI(VADD(Tu, VFNMS(LDK(KP500000000), Tx, VMUL(LDK(KP866025403), VSUB(Th, Tr)))));
Chris@82 269 TA = VFMA(LDK(KP866025403), VSUB(Tw, Tv), VFNMS(LDK(KP500000000), Ts, T7));
Chris@82 270 ST(&(x[WS(rs, 4)]), VADD(Tz, TA), ms, &(x[0]));
Chris@82 271 ST(&(x[WS(rs, 5)]), VSUB(TA, Tz), ms, &(x[WS(rs, 1)]));
Chris@82 272 }
Chris@82 273 }
Chris@82 274 }
Chris@82 275 }
Chris@82 276 VLEAVE();
Chris@82 277 }
Chris@82 278
/*
 * Twiddle instruction list referenced by desc: one VTW(0, k) entry for each
 * k = 1..8, closed by a TW_NEXT entry. The eight VTW entries line up with the
 * eight BYTW calls in t1bv_9 (inputs 1..8). The list is identical to the one
 * in the FMA branch of this file.
 * NOTE(review): the exact meaning of the VTW arguments and of {TW_NEXT, VL, 0}
 * is defined outside this file - confirm in the FFTW twiddle headers.
 */
Chris@82 279 static const tw_instr twinstr[] = {
Chris@82 280 VTW(0, 1),
Chris@82 281 VTW(0, 2),
Chris@82 282 VTW(0, 3),
Chris@82 283 VTW(0, 4),
Chris@82 284 VTW(0, 5),
Chris@82 285 VTW(0, 6),
Chris@82 286 VTW(0, 7),
Chris@82 287 VTW(0, 8),
Chris@82 288 {TW_NEXT, VL, 0}
Chris@82 289 };
Chris@82 290
/*
 * Descriptor passed to the planner for the non-FMA variant: 9 (the transform
 * size from the generator's -n 9), the codelet name string, the twiddle list,
 * the genus, and {38, 26, 16, 0}, which matches the "38 additions,
 * 26 multiplications, 16 fused multiply/add" tally in the comment that
 * precedes t1bv_9 in this #else branch.
 * NOTE(review): the role of the three trailing zeros is set by the ct_desc
 * definition, which is not visible here.
 */
Chris@82 291 static const ct_desc desc = { 9, XSIMD_STRING("t1bv_9"), twinstr, &GENUS, {38, 26, 16, 0}, 0, 0, 0 };
Chris@82 292
/*
 * Registration hook for the non-FMA variant: hands t1bv_9 and its descriptor
 * to the planner p through X(kdft_dit_register). Same symbol name as in the
 * FMA branch; only one of the two is compiled.
 */
Chris@82 293 void XSIMD(codelet_t1bv_9) (planner *p) {
Chris@82 294 X(kdft_dit_register) (p, t1bv_9, &desc);
Chris@82 295 }
Chris@82 296 #endif