annotate src/fftw-3.3.3/dft/simd/common/t1fuv_9.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:38:00 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1fuv_9 -include t1fu.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 54 FP additions, 54 FP multiplications,
Chris@10 32 * (or, 20 additions, 20 multiplications, 34 fused multiply/add),
Chris@10 33 * 67 stack variables, 19 constants, and 18 memory accesses
Chris@10 34 */
Chris@10 35 #include "t1fu.h"
Chris@10 36
/*
 * t1fuv_9, HAVE_FMA variant: machine-generated size-9 twiddle butterfly
 * (genfft gen_twiddle_c, -n 9 -simd -fma). Do not hand-edit the arithmetic.
 *
 * The routine works in place on x = ri. Each loop iteration loads x[0] and
 * x[WS(rs, k)] for k = 1..8, passes inputs 1..8 through BYTWJ with an entry
 * of W, combines the nine values, and stores nine results back to the same
 * nine locations.
 *
 * Parameters:
 *   ri     - base of the data; read and written through x.
 *   ii     - not referenced anywhere in this body.
 *   W      - table consumed by BYTWJ; offset by mb * ((TWVL / VL) * 16)
 *            before the first iteration and by TWVL * 16 per iteration.
 *   rs     - stride handed to WS() to address the nine points.
 *   mb, me - loop bounds: m starts at mb and runs while m < me, step VL.
 *   ms     - x advances by VL * ms per iteration; also passed to LD/ST.
 *
 * NOTE(review): BYTWJ presumably multiplies by a (conjugated) twiddle factor
 * and VFMAI/VFNMSI presumably add/subtract i times their first argument --
 * confirm against the SIMD support headers, which are not visible here.
 */
Chris@10 37 static void t1fuv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
/* Numeric constants for the butterfly, declared with DVK and read via LDK. */
Chris@10 39 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@10 40 DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
Chris@10 41 DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
Chris@10 42 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@10 43 DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
Chris@10 44 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@10 45 DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
Chris@10 46 DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
Chris@10 47 DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
Chris@10 48 DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
Chris@10 49 DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
Chris@10 50 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 51 DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
Chris@10 52 DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
Chris@10 53 DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
Chris@10 54 DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
Chris@10 55 DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
Chris@10 56 DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
Chris@10 57 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 58 {
Chris@10 59 INT m;
Chris@10 60 R *x;
Chris@10 61 x = ri;
/* One iteration handles VL values of m; x and W are advanced in lock-step. */
Chris@10 62 for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
Chris@10 63 V T1, T3, T5, T9, Th, Tb, Td, Tj, Tl, TD, T6;
/* Input 0 is used as loaded; it is the only input not passed to BYTWJ. */
Chris@10 64 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 65 {
Chris@10 66 V T2, T4, T8, Tg;
/* Load the remaining eight inputs (indices 3, 6, 1, 2, then 4, 7, 5, 8). */
Chris@10 67 T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 68 T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 69 T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 70 Tg = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 71 {
Chris@10 72 V Ta, Tc, Ti, Tk;
Chris@10 73 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 74 Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 75 Ti = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 76 Tk = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
/* Input k (k = 1..8) is paired with the W entry at offset TWVL * 2 * (k - 1). */
Chris@10 77 T3 = BYTWJ(&(W[TWVL * 4]), T2);
Chris@10 78 T5 = BYTWJ(&(W[TWVL * 10]), T4);
Chris@10 79 T9 = BYTWJ(&(W[0]), T8);
Chris@10 80 Th = BYTWJ(&(W[TWVL * 2]), Tg);
Chris@10 81 Tb = BYTWJ(&(W[TWVL * 6]), Ta);
Chris@10 82 Td = BYTWJ(&(W[TWVL * 12]), Tc);
Chris@10 83 Tj = BYTWJ(&(W[TWVL * 8]), Ti);
Chris@10 84 Tl = BYTWJ(&(W[TWVL * 14]), Tk);
Chris@10 85 }
Chris@10 86 }
/* Difference and sum of the processed inputs 6 and 3. */
Chris@10 87 TD = VSUB(T5, T3);
Chris@10 88 T6 = VADD(T3, T5);
Chris@10 89 {
Chris@10 90 V Tt, Te, Tu, Tm, Tr, T7;
/* Same sum/difference pairing for inputs (4, 7) and (5, 8). */
Chris@10 91 Tt = VSUB(Tb, Td);
Chris@10 92 Te = VADD(Tb, Td);
Chris@10 93 Tu = VSUB(Tl, Tj);
Chris@10 94 Tm = VADD(Tj, Tl);
Chris@10 95 Tr = VFNMS(LDK(KP500000000), T6, T1);
/* T7, Tf, Tn below are the three-term sums over indices {0,3,6}, {1,4,7}, {2,5,8}. */
Chris@10 96 T7 = VADD(T1, T6);
Chris@10 97 {
Chris@10 98 V Tv, Tf, Ts, Tn;
Chris@10 99 Tv = VFNMS(LDK(KP500000000), Te, T9);
Chris@10 100 Tf = VADD(T9, Te);
Chris@10 101 Ts = VFNMS(LDK(KP500000000), Tm, Th);
Chris@10 102 Tn = VADD(Th, Tm);
Chris@10 103 {
Chris@10 104 V TG, TK, Tw, TJ, TF, TA, To, Tq;
Chris@10 105 TG = VFNMS(LDK(KP726681596), Tt, Tv);
Chris@10 106 TK = VFMA(LDK(KP968908795), Tv, Tt);
Chris@10 107 Tw = VFNMS(LDK(KP586256827), Tv, Tu);
Chris@10 108 TJ = VFNMS(LDK(KP152703644), Tu, Ts);
Chris@10 109 TF = VFMA(LDK(KP203604859), Ts, Tu);
Chris@10 110 TA = VFNMS(LDK(KP439692620), Tt, Ts);
Chris@10 111 To = VADD(Tf, Tn);
Chris@10 112 Tq = VMUL(LDK(KP866025403), VSUB(Tn, Tf));
Chris@10 113 {
Chris@10 114 V TQ, TH, TL, TN, TB, Tp, Ty, TI, Tx;
Chris@10 115 Tx = VFNMS(LDK(KP347296355), Tw, Tt);
Chris@10 116 TQ = VFNMS(LDK(KP898197570), TG, TF);
Chris@10 117 TH = VFMA(LDK(KP898197570), TG, TF);
Chris@10 118 TL = VFMA(LDK(KP673648177), TK, TJ);
Chris@10 119 TN = VFNMS(LDK(KP673648177), TK, TJ);
Chris@10 120 TB = VFNMS(LDK(KP420276625), TA, Tu);
/* Output 0 is T7 + To, i.e. the sum of all nine processed inputs. */
Chris@10 121 ST(&(x[0]), VADD(T7, To), ms, &(x[0]));
Chris@10 122 Tp = VFNMS(LDK(KP500000000), To, T7);
Chris@10 123 Ty = VFNMS(LDK(KP907603734), Tx, Ts);
Chris@10 124 TI = VFMA(LDK(KP852868531), TH, Tr);
Chris@10 125 {
Chris@10 126 V TO, TR, TM, TC, Tz, TP, TS, TE;
Chris@10 127 TO = VFNMS(LDK(KP500000000), TH, TN);
Chris@10 128 TR = VFMA(LDK(KP666666666), TL, TQ);
Chris@10 129 TM = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), TD, TL));
Chris@10 130 TC = VFNMS(LDK(KP826351822), TB, Tv);
/* Outputs are written in pairs built from the same two operands: (6, 3) from Tq and Tp. */
Chris@10 131 ST(&(x[WS(rs, 6)]), VFNMSI(Tq, Tp), ms, &(x[0]));
Chris@10 132 ST(&(x[WS(rs, 3)]), VFMAI(Tq, Tp), ms, &(x[WS(rs, 1)]));
Chris@10 133 Tz = VFNMS(LDK(KP939692620), Ty, Tr);
Chris@10 134 TP = VFMA(LDK(KP852868531), TO, Tr);
Chris@10 135 TS = VMUL(LDK(KP866025403), VFMA(LDK(KP852868531), TR, TD));
/* Pair (8, 1) from TM and TI. */
Chris@10 136 ST(&(x[WS(rs, 8)]), VFMAI(TM, TI), ms, &(x[0]));
Chris@10 137 ST(&(x[WS(rs, 1)]), VFNMSI(TM, TI), ms, &(x[WS(rs, 1)]));
Chris@10 138 TE = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), TD, TC));
/* Pair (4, 5) from TS and TP, then pair (7, 2) from TE and Tz. */
Chris@10 139 ST(&(x[WS(rs, 4)]), VFMAI(TS, TP), ms, &(x[0]));
Chris@10 140 ST(&(x[WS(rs, 5)]), VFNMSI(TS, TP), ms, &(x[WS(rs, 1)]));
Chris@10 141 ST(&(x[WS(rs, 7)]), VFMAI(TE, Tz), ms, &(x[WS(rs, 1)]));
Chris@10 142 ST(&(x[WS(rs, 2)]), VFNMSI(TE, Tz), ms, &(x[0]));
Chris@10 143 }
Chris@10 144 }
Chris@10 145 }
Chris@10 146 }
Chris@10 147 }
Chris@10 148 }
Chris@10 149 }
/* VLEAVE() is a macro supplied by the SIMD headers; its effect is not visible in this file. */
Chris@10 150 VLEAVE();
Chris@10 151 }
Chris@10 152
/*
 * Twiddle instruction table referenced by desc for the HAVE_FMA variant:
 * eight VTW entries with second argument 1..8 (one per non-zero input index
 * of the size-9 butterfly), closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): the exact meaning of VTW's arguments and of TW_NEXT is defined
 * in headers outside this file -- confirm there.
 */
Chris@10 153 static const tw_instr twinstr[] = {
Chris@10 154 VTW(0, 1),
Chris@10 155 VTW(0, 2),
Chris@10 156 VTW(0, 3),
Chris@10 157 VTW(0, 4),
Chris@10 158 VTW(0, 5),
Chris@10 159 VTW(0, 6),
Chris@10 160 VTW(0, 7),
Chris@10 161 VTW(0, 8),
Chris@10 162 {TW_NEXT, VL, 0}
Chris@10 163 };
Chris@10 164
/*
 * Descriptor handed to the planner at registration: 9, the codelet name
 * string, the twinstr table, &GENUS, and the counts {20, 20, 34, 0}. The
 * first three counts match the "20 additions, 20 multiplications, 34 fused
 * multiply/add" figures in the generated header comment of this variant.
 * NOTE(review): the three trailing zeros are ct_desc fields whose meaning is
 * not visible here.
 */
Chris@10 165 static const ct_desc desc = { 9, XSIMD_STRING("t1fuv_9"), twinstr, &GENUS, {20, 20, 34, 0}, 0, 0, 0 };
Chris@10 166
/*
 * Registration entry point (HAVE_FMA variant): passes t1fuv_9 together with
 * its descriptor to X(kdft_dit_register) on planner p. The XSIMD() and X()
 * macros wrap the symbol names; their expansions are defined elsewhere.
 */
Chris@10 167 void XSIMD(codelet_t1fuv_9) (planner *p) {
Chris@10 168 X(kdft_dit_register) (p, t1fuv_9, &desc);
Chris@10 169 }
Chris@10 170 #else /* HAVE_FMA */
Chris@10 171
Chris@10 172 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name t1fuv_9 -include t1fu.h */
Chris@10 173
Chris@10 174 /*
Chris@10 175 * This function contains 54 FP additions, 42 FP multiplications,
Chris@10 176 * (or, 38 additions, 26 multiplications, 16 fused multiply/add),
Chris@10 177 * 38 stack variables, 14 constants, and 18 memory accesses
Chris@10 178 */
Chris@10 179 #include "t1fu.h"
Chris@10 180
/*
 * t1fuv_9, non-FMA variant: machine-generated size-9 twiddle butterfly
 * (genfft gen_twiddle_c, -n 9 -simd, without -fma). Do not hand-edit the
 * arithmetic.
 *
 * The routine works in place on x = ri. Each loop iteration loads x[0] and
 * x[WS(rs, k)] for k = 1..8, passes inputs 1..8 through BYTWJ with an entry
 * of W, combines the nine values, and stores nine results back to the same
 * nine locations.
 *
 * Parameters:
 *   ri     - base of the data; read and written through x.
 *   ii     - not referenced anywhere in this body.
 *   W      - table consumed by BYTWJ; offset by mb * ((TWVL / VL) * 16)
 *            before the first iteration and by TWVL * 16 per iteration.
 *   rs     - stride handed to WS() to address the nine points.
 *   mb, me - loop bounds: m starts at mb and runs while m < me, step VL.
 *   ms     - x advances by VL * ms per iteration; also passed to LD/ST.
 *
 * NOTE(review): BYTWJ presumably multiplies by a (conjugated) twiddle factor
 * and VBYI presumably multiplies by i -- confirm against the SIMD support
 * headers, which are not visible here.
 */
Chris@10 181 static void t1fuv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 182 {
/* Numeric constants for the butterfly, declared with DVK and read via LDK. */
Chris@10 183 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@10 184 DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
Chris@10 185 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@10 186 DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
Chris@10 187 DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
Chris@10 188 DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
Chris@10 189 DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
Chris@10 190 DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
Chris@10 191 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@10 192 DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
Chris@10 193 DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
Chris@10 194 DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
Chris@10 195 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 196 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 197 {
Chris@10 198 INT m;
Chris@10 199 R *x;
Chris@10 200 x = ri;
/* One iteration handles VL values of m; x and W are advanced in lock-step. */
Chris@10 201 for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) {
Chris@10 202 V T1, T6, TA, Tt, Tf, Ts, Tw, Tn, Tv;
/* Input 0 is used as loaded; it is the only input not passed to BYTWJ. */
Chris@10 203 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 204 {
Chris@10 205 V T3, T5, T2, T4;
/* Inputs 3 and 6: load, apply BYTWJ, keep their sum (T6) and a scaled difference (TA). */
Chris@10 206 T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 207 T3 = BYTWJ(&(W[TWVL * 4]), T2);
Chris@10 208 T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 209 T5 = BYTWJ(&(W[TWVL * 10]), T4);
Chris@10 210 T6 = VADD(T3, T5);
Chris@10 211 TA = VMUL(LDK(KP866025403), VSUB(T5, T3));
Chris@10 212 }
Chris@10 213 {
Chris@10 214 V T9, Td, Tb, T8, Tc, Ta, Te;
/* Inputs 1, 4, 7: three-term sum Tf, plus the partial results Tt and Ts used later. */
Chris@10 215 T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 216 T9 = BYTWJ(&(W[0]), T8);
Chris@10 217 Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 218 Td = BYTWJ(&(W[TWVL * 12]), Tc);
Chris@10 219 Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 220 Tb = BYTWJ(&(W[TWVL * 6]), Ta);
Chris@10 221 Tt = VSUB(Td, Tb);
Chris@10 222 Te = VADD(Tb, Td);
Chris@10 223 Tf = VADD(T9, Te);
Chris@10 224 Ts = VFNMS(LDK(KP500000000), Te, T9);
Chris@10 225 }
Chris@10 226 {
Chris@10 227 V Th, Tl, Tj, Tg, Tk, Ti, Tm;
/* Inputs 2, 5, 8: three-term sum Tn, plus the partial results Tw and Tv used later. */
Chris@10 228 Tg = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 229 Th = BYTWJ(&(W[TWVL * 2]), Tg);
Chris@10 230 Tk = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 231 Tl = BYTWJ(&(W[TWVL * 14]), Tk);
Chris@10 232 Ti = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 233 Tj = BYTWJ(&(W[TWVL * 8]), Ti);
Chris@10 234 Tw = VSUB(Tl, Tj);
Chris@10 235 Tm = VADD(Tj, Tl);
Chris@10 236 Tn = VADD(Th, Tm);
Chris@10 237 Tv = VFNMS(LDK(KP500000000), Tm, Th);
Chris@10 238 }
Chris@10 239 {
Chris@10 240 V Tq, T7, To, Tp;
/* Outputs 0, 3, 6 depend only on the three group sums T7, Tf, Tn. */
Chris@10 241 Tq = VBYI(VMUL(LDK(KP866025403), VSUB(Tn, Tf)));
Chris@10 242 T7 = VADD(T1, T6);
Chris@10 243 To = VADD(Tf, Tn);
Chris@10 244 Tp = VFNMS(LDK(KP500000000), To, T7);
/* Output 0 is T7 + To, i.e. the sum of all nine processed inputs. */
Chris@10 245 ST(&(x[0]), VADD(T7, To), ms, &(x[0]));
Chris@10 246 ST(&(x[WS(rs, 3)]), VADD(Tp, Tq), ms, &(x[WS(rs, 1)]));
Chris@10 247 ST(&(x[WS(rs, 6)]), VSUB(Tp, Tq), ms, &(x[0]));
Chris@10 248 }
Chris@10 249 {
Chris@10 250 V TI, TB, TC, TD, Tu, Tx, Ty, Tr, TH;
/* Remaining six outputs are written as pairs X - Y / X + Y: (7, 2), (8, 1), (5, 4). */
Chris@10 251 TI = VBYI(VSUB(VFNMS(LDK(KP342020143), Tv, VFNMS(LDK(KP150383733), Tt, VFNMS(LDK(KP984807753), Ts, VMUL(LDK(KP813797681), Tw)))), TA));
Chris@10 252 TB = VFNMS(LDK(KP642787609), Ts, VMUL(LDK(KP663413948), Tt));
Chris@10 253 TC = VFNMS(LDK(KP984807753), Tv, VMUL(LDK(KP150383733), Tw));
Chris@10 254 TD = VADD(TB, TC);
Chris@10 255 Tu = VFMA(LDK(KP766044443), Ts, VMUL(LDK(KP556670399), Tt));
Chris@10 256 Tx = VFMA(LDK(KP173648177), Tv, VMUL(LDK(KP852868531), Tw));
Chris@10 257 Ty = VADD(Tu, Tx);
Chris@10 258 Tr = VFNMS(LDK(KP500000000), T6, T1);
Chris@10 259 TH = VFMA(LDK(KP173648177), Ts, VFNMS(LDK(KP296198132), Tw, VFNMS(LDK(KP939692620), Tv, VFNMS(LDK(KP852868531), Tt, Tr))));
/* Pair (7, 2) from TH and TI. */
Chris@10 260 ST(&(x[WS(rs, 7)]), VSUB(TH, TI), ms, &(x[WS(rs, 1)]));
Chris@10 261 ST(&(x[WS(rs, 2)]), VADD(TH, TI), ms, &(x[0]));
Chris@10 262 {
Chris@10 263 V Tz, TE, TF, TG;
Chris@10 264 Tz = VADD(Tr, Ty);
Chris@10 265 TE = VBYI(VADD(TA, TD));
/* Pair (8, 1) from Tz and TE. */
Chris@10 266 ST(&(x[WS(rs, 8)]), VSUB(Tz, TE), ms, &(x[0]));
Chris@10 267 ST(&(x[WS(rs, 1)]), VADD(TE, Tz), ms, &(x[WS(rs, 1)]));
Chris@10 268 TF = VFMA(LDK(KP866025403), VSUB(TB, TC), VFNMS(LDK(KP500000000), Ty, Tr));
Chris@10 269 TG = VBYI(VADD(TA, VFNMS(LDK(KP500000000), TD, VMUL(LDK(KP866025403), VSUB(Tx, Tu)))));
/* Pair (5, 4) from TF and TG. */
Chris@10 270 ST(&(x[WS(rs, 5)]), VSUB(TF, TG), ms, &(x[WS(rs, 1)]));
Chris@10 271 ST(&(x[WS(rs, 4)]), VADD(TF, TG), ms, &(x[0]));
Chris@10 272 }
Chris@10 273 }
Chris@10 274 }
Chris@10 275 }
/* VLEAVE() is a macro supplied by the SIMD headers; its effect is not visible in this file. */
Chris@10 276 VLEAVE();
Chris@10 277 }
Chris@10 278
/*
 * Twiddle instruction table referenced by desc for the non-FMA variant:
 * eight VTW entries with second argument 1..8 (one per non-zero input index
 * of the size-9 butterfly), closed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): the exact meaning of VTW's arguments and of TW_NEXT is defined
 * in headers outside this file -- confirm there.
 */
Chris@10 279 static const tw_instr twinstr[] = {
Chris@10 280 VTW(0, 1),
Chris@10 281 VTW(0, 2),
Chris@10 282 VTW(0, 3),
Chris@10 283 VTW(0, 4),
Chris@10 284 VTW(0, 5),
Chris@10 285 VTW(0, 6),
Chris@10 286 VTW(0, 7),
Chris@10 287 VTW(0, 8),
Chris@10 288 {TW_NEXT, VL, 0}
Chris@10 289 };
Chris@10 290
/*
 * Descriptor handed to the planner at registration: 9, the codelet name
 * string, the twinstr table, &GENUS, and the counts {38, 26, 16, 0}. The
 * first three counts match the "38 additions, 26 multiplications, 16 fused
 * multiply/add" figures in the generated header comment of this variant.
 * NOTE(review): the three trailing zeros are ct_desc fields whose meaning is
 * not visible here.
 */
Chris@10 291 static const ct_desc desc = { 9, XSIMD_STRING("t1fuv_9"), twinstr, &GENUS, {38, 26, 16, 0}, 0, 0, 0 };
Chris@10 292
/*
 * Registration entry point (non-FMA variant): passes t1fuv_9 together with
 * its descriptor to X(kdft_dit_register) on planner p. The XSIMD() and X()
 * macros wrap the symbol names; their expansions are defined elsewhere.
 */
Chris@10 293 void XSIMD(codelet_t1fuv_9) (planner *p) {
Chris@10 294 X(kdft_dit_register) (p, t1fuv_9, &desc);
Chris@10 295 }
Chris@10 296 #endif /* HAVE_FMA */