annotate src/fftw-3.3.3/dft/simd/common/t1bv_9.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:39:04 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1bv_9 -include t1b.h -sign 1 */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 54 FP additions, 54 FP multiplications,
Chris@10 32 * (or, 20 additions, 20 multiplications, 34 fused multiply/add),
Chris@10 33 * 67 stack variables, 19 constants, and 18 memory accesses
Chris@10 34 */
Chris@10 35 #include "t1b.h"
Chris@10 36
/*
 * t1bv_9 (HAVE_FMA variant): in-place vectorized codelet over nine strided
 * elements, produced by genfft with "-n 9 -sign 1" (see the generator
 * comment in this file).
 *
 * Only ii is used as the data pointer (x = ii); ri is not referenced
 * directly in this body.  Each loop iteration loads x[0] and
 * x[WS(rs, 1..8)], passes inputs 1..8 through BYTW together with an entry
 * of W, combines the values, and stores nine results back to the same
 * nine locations.
 *
 * ri, ii - data pointers (see above)
 * W      - table read by BYTW; offset by mb * ((TWVL / VL) * 16) on entry
 *          and advanced by TWVL * 16 per iteration
 * rs     - stride between the nine elements, applied through WS()
 * mb, me - the loop runs from m = mb while m < me, in steps of VL
 * ms     - x advances by VL * ms per iteration
 *
 * NOTE(review): the third LD/ST argument is &(x[0]) for even element
 * indices and &(x[WS(rs, 1)]) for odd ones; presumably an alignment hint --
 * confirm against the LD/ST definitions.
 */
Chris@10 37 static void t1bv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@10 40 DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
Chris@10 41 DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
Chris@10 42 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@10 43 DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
Chris@10 44 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@10 45 DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
Chris@10 46 DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
Chris@10 47 DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
Chris@10 48 DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
Chris@10 49 DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
Chris@10 50 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 51 DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
Chris@10 52 DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
Chris@10 53 DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
Chris@10 54 DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
Chris@10 55 DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
Chris@10 56 DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
Chris@10 57 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 58 {
Chris@10 59 INT m;
Chris@10 60 R *x;
Chris@10 61 x = ii; /* all loads and stores below go through x, i.e. through ii */
Chris@10 62 for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
Chris@10 63 V T1, T3, T5, T9, Tn, Tb, Td, Th, Tj, Tx, T6;
Chris@10 64 T1 = LD(&(x[0]), ms, &(x[0])); /* element 0 is the only input not passed through BYTW */
Chris@10 65 {
Chris@10 66 V T2, T4, T8, Tm;
Chris@10 67 T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 68 T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 69 T8 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 70 Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 71 {
Chris@10 72 V Ta, Tc, Tg, Ti;
Chris@10 73 Ta = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 74 Tc = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 75 Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 76 Ti = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 77 T3 = BYTW(&(W[TWVL * 4]), T2); /* input k is paired with W[TWVL * 2 * (k - 1)]: here k = 3 */
Chris@10 78 T5 = BYTW(&(W[TWVL * 10]), T4);
Chris@10 79 T9 = BYTW(&(W[TWVL * 2]), T8);
Chris@10 80 Tn = BYTW(&(W[0]), Tm);
Chris@10 81 Tb = BYTW(&(W[TWVL * 8]), Ta);
Chris@10 82 Td = BYTW(&(W[TWVL * 14]), Tc);
Chris@10 83 Th = BYTW(&(W[TWVL * 6]), Tg);
Chris@10 84 Tj = BYTW(&(W[TWVL * 12]), Ti);
Chris@10 85 }
Chris@10 86 }
Chris@10 87 Tx = VSUB(T3, T5);
Chris@10 88 T6 = VADD(T3, T5);
Chris@10 89 {
Chris@10 90 V Tl, Te, Tk, To, T7, TN;
Chris@10 91 Tl = VSUB(Td, Tb);
Chris@10 92 Te = VADD(Tb, Td);
Chris@10 93 Tk = VSUB(Th, Tj);
Chris@10 94 To = VADD(Th, Tj);
Chris@10 95 T7 = VFNMS(LDK(KP500000000), T6, T1);
Chris@10 96 TN = VADD(T1, T6); /* combines elements 0, 3 and 6 */
Chris@10 97 {
Chris@10 98 V Tf, TP, Tp, TO;
Chris@10 99 Tf = VFNMS(LDK(KP500000000), Te, T9);
Chris@10 100 TP = VADD(T9, Te); /* combines elements 2, 5 and 8 */
Chris@10 101 Tp = VFNMS(LDK(KP500000000), To, Tn);
Chris@10 102 TO = VADD(Tn, To); /* combines elements 1, 4 and 7 */
Chris@10 103 {
Chris@10 104 V Tz, TC, Tu, TD, TA, Tq, TQ, TS;
Chris@10 105 Tz = VFNMS(LDK(KP152703644), Tl, Tf);
Chris@10 106 TC = VFMA(LDK(KP203604859), Tf, Tl);
Chris@10 107 Tu = VFNMS(LDK(KP439692620), Tk, Tf);
Chris@10 108 TD = VFNMS(LDK(KP726681596), Tk, Tp);
Chris@10 109 TA = VFMA(LDK(KP968908795), Tp, Tk);
Chris@10 110 Tq = VFNMS(LDK(KP586256827), Tp, Tl);
Chris@10 111 TQ = VADD(TO, TP);
Chris@10 112 TS = VMUL(LDK(KP866025403), VSUB(TO, TP));
Chris@10 113 {
Chris@10 114 V TI, TB, TH, TE, Tr, TR, Tw, Tv;
Chris@10 115 Tv = VFNMS(LDK(KP420276625), Tu, Tl);
Chris@10 116 TI = VFMA(LDK(KP673648177), TA, Tz);
Chris@10 117 TB = VFNMS(LDK(KP673648177), TA, Tz);
Chris@10 118 TH = VFNMS(LDK(KP898197570), TD, TC);
Chris@10 119 TE = VFMA(LDK(KP898197570), TD, TC);
Chris@10 120 Tr = VFNMS(LDK(KP347296355), Tq, Tk);
Chris@10 121 ST(&(x[0]), VADD(TQ, TN), ms, &(x[0])); /* first store; every LD of this iteration appears above it */
Chris@10 122 TR = VFNMS(LDK(KP500000000), TQ, TN);
Chris@10 123 Tw = VFNMS(LDK(KP826351822), Tv, Tp);
Chris@10 124 {
Chris@10 125 V TM, TL, TF, TJ, Ts, Ty, TG, TK, Tt;
Chris@10 126 TM = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tx, TI));
Chris@10 127 TL = VFMA(LDK(KP852868531), TE, T7);
Chris@10 128 TF = VFNMS(LDK(KP500000000), TE, TB);
Chris@10 129 TJ = VFMA(LDK(KP666666666), TI, TH);
Chris@10 130 Ts = VFNMS(LDK(KP907603734), Tr, Tf);
Chris@10 131 ST(&(x[WS(rs, 6)]), VFNMSI(TS, TR), ms, &(x[0])); /* outputs 6 and 3 are built from the same TS/TR pair */
Chris@10 132 ST(&(x[WS(rs, 3)]), VFMAI(TS, TR), ms, &(x[WS(rs, 1)]));
Chris@10 133 Ty = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tx, Tw));
Chris@10 134 ST(&(x[WS(rs, 8)]), VFNMSI(TM, TL), ms, &(x[0])); /* outputs 8 and 1 are built from the same TM/TL pair */
Chris@10 135 ST(&(x[WS(rs, 1)]), VFMAI(TM, TL), ms, &(x[WS(rs, 1)]));
Chris@10 136 TG = VFMA(LDK(KP852868531), TF, T7);
Chris@10 137 TK = VMUL(LDK(KP866025403), VFNMS(LDK(KP852868531), TJ, Tx));
Chris@10 138 Tt = VFNMS(LDK(KP939692620), Ts, T7);
Chris@10 139 ST(&(x[WS(rs, 5)]), VFNMSI(TK, TG), ms, &(x[WS(rs, 1)])); /* outputs 5 and 4 are built from the same TK/TG pair */
Chris@10 140 ST(&(x[WS(rs, 4)]), VFMAI(TK, TG), ms, &(x[0]));
Chris@10 141 ST(&(x[WS(rs, 2)]), VFMAI(Ty, Tt), ms, &(x[0])); /* outputs 2 and 7 are built from the same Ty/Tt pair */
Chris@10 142 ST(&(x[WS(rs, 7)]), VFNMSI(Ty, Tt), ms, &(x[WS(rs, 1)]));
Chris@10 143 }
Chris@10 144 }
Chris@10 145 }
Chris@10 146 }
Chris@10 147 }
Chris@10 148 }
Chris@10 149 }
Chris@10 150 VLEAVE();
Chris@10 151 }
Chris@10 152
/*
 * Twiddle instruction table for the HAVE_FMA build: eight VTW entries whose
 * second argument runs from 1 to 8, terminated by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably one VTW entry per twiddled input of t1bv_9
 * (which applies BYTW to inputs 1..8) -- confirm against the VTW definition.
 */
Chris@10 153 static const tw_instr twinstr[] = {
Chris@10 154 VTW(0, 1),
Chris@10 155 VTW(0, 2),
Chris@10 156 VTW(0, 3),
Chris@10 157 VTW(0, 4),
Chris@10 158 VTW(0, 5),
Chris@10 159 VTW(0, 6),
Chris@10 160 VTW(0, 7),
Chris@10 161 VTW(0, 8),
Chris@10 162 {TW_NEXT, VL, 0}
Chris@10 163 };
Chris@10 164
/*
 * Codelet descriptor for the HAVE_FMA build: the value 9, the name string
 * "t1bv_9", the twinstr table, &GENUS, and the counts {20, 20, 34, 0}, which
 * match the "20 additions, 20 multiplications, 34 fused multiply/add" line of
 * the generator comment.  NOTE(review): the leading 9 is presumably the radix
 * and the trailing zeros are not explained here -- confirm against ct_desc.
 */
Chris@10 165 static const ct_desc desc = { 9, XSIMD_STRING("t1bv_9"), twinstr, &GENUS, {20, 20, 34, 0}, 0, 0, 0 };
Chris@10 166
/*
 * Registration entry point (HAVE_FMA build): hands the t1bv_9 function and
 * its descriptor to the planner p through X(kdft_dit_register).
 */
Chris@10 167 void XSIMD(codelet_t1bv_9) (planner *p) {
Chris@10 168 X(kdft_dit_register) (p, t1bv_9, &desc);
Chris@10 169 }
Chris@10 170 #else /* HAVE_FMA */
Chris@10 171
Chris@10 172 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1bv_9 -include t1b.h -sign 1 */
Chris@10 173
Chris@10 174 /*
Chris@10 175 * This function contains 54 FP additions, 42 FP multiplications,
Chris@10 176 * (or, 38 additions, 26 multiplications, 16 fused multiply/add),
Chris@10 177 * 38 stack variables, 14 constants, and 18 memory accesses
Chris@10 178 */
Chris@10 179 #include "t1b.h"
Chris@10 180
/*
 * t1bv_9 (build without HAVE_FMA): in-place vectorized codelet over nine
 * strided elements, produced by genfft with "-n 9 -sign 1" (see the
 * generator comment in this file).
 *
 * Only ii is used as the data pointer (x = ii); ri is not referenced
 * directly in this body.  Each loop iteration loads x[0] and
 * x[WS(rs, 1..8)], passes inputs 1..8 through BYTW together with an entry
 * of W, combines the values, and stores nine results back to the same
 * nine locations.  Every LD of an iteration appears before its first ST.
 *
 * ri, ii - data pointers (see above)
 * W      - table read by BYTW; offset by mb * ((TWVL / VL) * 16) on entry
 *          and advanced by TWVL * 16 per iteration
 * rs     - stride between the nine elements, applied through WS()
 * mb, me - the loop runs from m = mb while m < me, in steps of VL
 * ms     - x advances by VL * ms per iteration
 *
 * NOTE(review): the third LD/ST argument is &(x[0]) for even element
 * indices and &(x[WS(rs, 1)]) for odd ones; presumably an alignment hint --
 * confirm against the LD/ST definitions.
 */
Chris@10 181 static void t1bv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 182 {
Chris@10 183 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@10 184 DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
Chris@10 185 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@10 186 DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
Chris@10 187 DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
Chris@10 188 DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
Chris@10 189 DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
Chris@10 190 DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
Chris@10 191 DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
Chris@10 192 DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
Chris@10 193 DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
Chris@10 194 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@10 195 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 196 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 197 {
Chris@10 198 INT m;
Chris@10 199 R *x;
Chris@10 200 x = ii; /* all loads and stores below go through x, i.e. through ii */
Chris@10 201 for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
Chris@10 202 V T1, T6, Tu, Tg, Tf, TD, Tq, Tp, TE;
Chris@10 203 T1 = LD(&(x[0]), ms, &(x[0])); /* element 0 is the only input not passed through BYTW */
Chris@10 204 { /* elements 3 and 6: sum in T6, scaled difference in Tu */
Chris@10 205 V T3, T5, T2, T4;
Chris@10 206 T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 207 T3 = BYTW(&(W[TWVL * 4]), T2); /* input k is paired with W[TWVL * 2 * (k - 1)]: here k = 3 */
Chris@10 208 T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 209 T5 = BYTW(&(W[TWVL * 10]), T4);
Chris@10 210 T6 = VADD(T3, T5);
Chris@10 211 Tu = VMUL(LDK(KP866025403), VSUB(T3, T5));
Chris@10 212 }
Chris@10 213 { /* elements 1, 4 and 7: produces Tg, Tf and TD */
Chris@10 214 V T9, Td, Tb, T8, Tc, Ta, Te;
Chris@10 215 T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 216 T9 = BYTW(&(W[0]), T8);
Chris@10 217 Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 218 Td = BYTW(&(W[TWVL * 12]), Tc);
Chris@10 219 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 220 Tb = BYTW(&(W[TWVL * 6]), Ta);
Chris@10 221 Tg = VSUB(Tb, Td);
Chris@10 222 Te = VADD(Tb, Td);
Chris@10 223 Tf = VFNMS(LDK(KP500000000), Te, T9);
Chris@10 224 TD = VADD(T9, Te);
Chris@10 225 }
Chris@10 226 { /* elements 2, 5 and 8: produces Tq, Tp and TE */
Chris@10 227 V Tj, Tn, Tl, Ti, Tm, Tk, To;
Chris@10 228 Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 229 Tj = BYTW(&(W[TWVL * 2]), Ti);
Chris@10 230 Tm = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 231 Tn = BYTW(&(W[TWVL * 14]), Tm);
Chris@10 232 Tk = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 233 Tl = BYTW(&(W[TWVL * 8]), Tk);
Chris@10 234 Tq = VSUB(Tl, Tn);
Chris@10 235 To = VADD(Tl, Tn);
Chris@10 236 Tp = VFNMS(LDK(KP500000000), To, Tj);
Chris@10 237 TE = VADD(Tj, To);
Chris@10 238 }
Chris@10 239 { /* outputs 0, 3 and 6, built from T1, T6, TD and TE only */
Chris@10 240 V TF, TG, TH, TI;
Chris@10 241 TF = VBYI(VMUL(LDK(KP866025403), VSUB(TD, TE)));
Chris@10 242 TG = VADD(T1, T6);
Chris@10 243 TH = VADD(TD, TE);
Chris@10 244 TI = VFNMS(LDK(KP500000000), TH, TG);
Chris@10 245 ST(&(x[WS(rs, 3)]), VADD(TF, TI), ms, &(x[WS(rs, 1)]));
Chris@10 246 ST(&(x[0]), VADD(TG, TH), ms, &(x[0]));
Chris@10 247 ST(&(x[WS(rs, 6)]), VSUB(TI, TF), ms, &(x[0]));
Chris@10 248 }
Chris@10 249 { /* outputs 1, 2, 4, 5, 7 and 8 */
Chris@10 250 V TC, Tv, Tw, Tx, Th, Tr, Ts, T7, TB;
Chris@10 251 TC = VBYI(VSUB(VFMA(LDK(KP984807753), Tf, VFMA(LDK(KP813797681), Tq, VFNMS(LDK(KP150383733), Tg, VMUL(LDK(KP342020143), Tp)))), Tu));
Chris@10 252 Tv = VFMA(LDK(KP663413948), Tg, VMUL(LDK(KP642787609), Tf));
Chris@10 253 Tw = VFMA(LDK(KP150383733), Tq, VMUL(LDK(KP984807753), Tp));
Chris@10 254 Tx = VADD(Tv, Tw);
Chris@10 255 Th = VFNMS(LDK(KP556670399), Tg, VMUL(LDK(KP766044443), Tf));
Chris@10 256 Tr = VFNMS(LDK(KP852868531), Tq, VMUL(LDK(KP173648177), Tp));
Chris@10 257 Ts = VADD(Th, Tr);
Chris@10 258 T7 = VFNMS(LDK(KP500000000), T6, T1);
Chris@10 259 TB = VFMA(LDK(KP852868531), Tg, VFMA(LDK(KP173648177), Tf, VFMA(LDK(KP296198132), Tq, VFNMS(LDK(KP939692620), Tp, T7))));
Chris@10 260 ST(&(x[WS(rs, 7)]), VSUB(TB, TC), ms, &(x[WS(rs, 1)])); /* outputs 7 and 2 are TB -/+ TC */
Chris@10 261 ST(&(x[WS(rs, 2)]), VADD(TB, TC), ms, &(x[0]));
Chris@10 262 {
Chris@10 263 V Tt, Ty, Tz, TA;
Chris@10 264 Tt = VADD(T7, Ts);
Chris@10 265 Ty = VBYI(VADD(Tu, Tx));
Chris@10 266 ST(&(x[WS(rs, 8)]), VSUB(Tt, Ty), ms, &(x[0])); /* outputs 8 and 1 are Tt -/+ Ty */
Chris@10 267 ST(&(x[WS(rs, 1)]), VADD(Tt, Ty), ms, &(x[WS(rs, 1)]));
Chris@10 268 Tz = VBYI(VADD(Tu, VFNMS(LDK(KP500000000), Tx, VMUL(LDK(KP866025403), VSUB(Th, Tr)))));
Chris@10 269 TA = VFMA(LDK(KP866025403), VSUB(Tw, Tv), VFNMS(LDK(KP500000000), Ts, T7));
Chris@10 270 ST(&(x[WS(rs, 4)]), VADD(Tz, TA), ms, &(x[0])); /* outputs 4 and 5 are TA +/- Tz */
Chris@10 271 ST(&(x[WS(rs, 5)]), VSUB(TA, Tz), ms, &(x[WS(rs, 1)]));
Chris@10 272 }
Chris@10 273 }
Chris@10 274 }
Chris@10 275 }
Chris@10 276 VLEAVE();
Chris@10 277 }
Chris@10 278
/*
 * Twiddle instruction table for the build without HAVE_FMA: eight VTW
 * entries whose second argument runs from 1 to 8, terminated by a
 * {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably one VTW entry per twiddled input of t1bv_9
 * (which applies BYTW to inputs 1..8) -- confirm against the VTW definition.
 */
Chris@10 279 static const tw_instr twinstr[] = {
Chris@10 280 VTW(0, 1),
Chris@10 281 VTW(0, 2),
Chris@10 282 VTW(0, 3),
Chris@10 283 VTW(0, 4),
Chris@10 284 VTW(0, 5),
Chris@10 285 VTW(0, 6),
Chris@10 286 VTW(0, 7),
Chris@10 287 VTW(0, 8),
Chris@10 288 {TW_NEXT, VL, 0}
Chris@10 289 };
Chris@10 290
/*
 * Codelet descriptor for the build without HAVE_FMA: the value 9, the name
 * string "t1bv_9", the twinstr table, &GENUS, and the counts {38, 26, 16, 0},
 * which match the "38 additions, 26 multiplications, 16 fused multiply/add"
 * line of the generator comment.  NOTE(review): the leading 9 is presumably
 * the radix and the trailing zeros are not explained here -- confirm against
 * ct_desc.
 */
Chris@10 291 static const ct_desc desc = { 9, XSIMD_STRING("t1bv_9"), twinstr, &GENUS, {38, 26, 16, 0}, 0, 0, 0 };
Chris@10 292
/*
 * Registration entry point (build without HAVE_FMA): hands the t1bv_9
 * function and its descriptor to the planner p through X(kdft_dit_register).
 */
Chris@10 293 void XSIMD(codelet_t1bv_9) (planner *p) {
Chris@10 294 X(kdft_dit_register) (p, t1bv_9, &desc);
Chris@10 295 }
Chris@10 296 #endif /* HAVE_FMA */