annotate src/fftw-3.3.8/dft/simd/common/n1bv_9.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes).
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:55 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 9 -name n1bv_9 -include dft/simd/n1b.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 46 FP additions, 38 FP multiplications,
Chris@82 32 * (or, 12 additions, 4 multiplications, 34 fused multiply/add),
Chris@82 33 * 50 stack variables, 19 constants, and 18 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n1b.h"
Chris@82 36
/*
 * n1bv_9 (variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): generated SIMD kernel. According to the generator command line
 * recorded above this function it was produced with "-n 9 -sign 1".
 *
 * Parameters, as used by this body:
 *   ri, ro   - not referenced anywhere in this function.
 *   ii, io   - input / output base pointers; copied into xi / xo.
 *   is, os   - strides handed to WS() to address the 9 inputs / 9 outputs.
 *   v        - loop count; the loop runs while i > 0 and subtracts VL per pass.
 *   ivs, ovs - xi / xo advance by VL * ivs and VL * ovs per pass; the same
 *              values are also forwarded to LD and ST.
 *
 * Each pass loads xi[WS(is, k)] and stores xo[WS(os, k)] for k = 0..8.
 *
 * NOTE(review): the arithmetic notes below assume VFMA(a, b, c) = a*b + c,
 * VFNMS(a, b, c) = c - a*b, and VFMAI / VFNMSI(b, c) = c + i*b / c - i*b.
 * Those macros are defined outside this file - confirm against the SIMD
 * headers before relying on the notes.
 */
Chris@82 37 static void n1bv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
/* Constants declared through DVK; the digits in each KP name are the leading digits of its value. */
Chris@82 39 DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
Chris@82 40 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@82 41 DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
Chris@82 42 DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
Chris@82 43 DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
Chris@82 44 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@82 45 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@82 46 DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
Chris@82 47 DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
Chris@82 48 DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
Chris@82 49 DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
Chris@82 50 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 51 DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
Chris@82 52 DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
Chris@82 53 DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
Chris@82 54 DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
Chris@82 55 DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
Chris@82 56 DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
Chris@82 57 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 58 {
Chris@82 59 INT i;
Chris@82 60 const R *xi;
Chris@82 61 R *xo;
/* Only the ii / io pointers are used; ri and ro are never read here. */
Chris@82 62 xi = ii;
Chris@82 63 xo = io;
/* One pass per VL units of v; MAKE_VOLATILE_STRIDE is an external macro applied to both strides on every pass. */
Chris@82 64 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
Chris@82 65 V T5, TF, Tp, Te, Td, TG, TH, Ta, Tm, Tu, Tr, Th, Ti, Tv, Ts;
Chris@82 66 V TK, TI, TJ;
Chris@82 67 {
/* Inputs 0, 3, 6: T4 = x3 + x6, TF = x0 + x3 + x6, Tp = x3 - x6; T5 combines x0 with half of T4. */
Chris@82 68 V T1, T2, T3, T4;
Chris@82 69 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 70 T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 71 T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 72 T4 = VADD(T2, T3);
Chris@82 73 T5 = VFNMS(LDK(KP500000000), T4, T1);
Chris@82 74 TF = VADD(T1, T4);
Chris@82 75 Tp = VSUB(T2, T3);
Chris@82 76 }
Chris@82 77 {
/* Inputs 2, 5, 8 (T6, T7, T8) and 1, 4, 7 (Tf, Tb, Tc). */
Chris@82 78 V T6, Tf, T9, Tg;
Chris@82 79 T6 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 80 Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 81 {
Chris@82 82 V T7, T8, Tb, Tc;
Chris@82 83 T7 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 84 T8 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 85 T9 = VADD(T7, T8);
Chris@82 86 Te = VSUB(T8, T7);
Chris@82 87 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 88 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 89 Td = VSUB(Tb, Tc);
Chris@82 90 Tg = VADD(Tb, Tc);
Chris@82 91 }
/* TG = x1 + x4 + x7 and TH = x2 + x5 + x8; the remaining values are intermediates reused by the output groups below. */
Chris@82 92 TG = VADD(Tf, Tg);
Chris@82 93 TH = VADD(T6, T9);
Chris@82 94 Ta = VFNMS(LDK(KP500000000), T9, T6);
Chris@82 95 Tm = VFNMS(LDK(KP439692620), Td, Ta);
Chris@82 96 Tu = VFMA(LDK(KP203604859), Ta, Te);
Chris@82 97 Tr = VFNMS(LDK(KP152703644), Te, Ta);
Chris@82 98 Th = VFNMS(LDK(KP500000000), Tg, Tf);
Chris@82 99 Ti = VFNMS(LDK(KP586256827), Th, Te);
Chris@82 100 Tv = VFNMS(LDK(KP726681596), Td, Th);
Chris@82 101 Ts = VFMA(LDK(KP968908795), Th, Td);
Chris@82 102 }
/* Outputs 3, 0, 6 are built from the three column sums TF, TG, TH; output 0 is TI + TF, i.e. the sum of all nine inputs. */
Chris@82 103 TK = VMUL(LDK(KP866025403), VSUB(TG, TH));
Chris@82 104 TI = VADD(TG, TH);
Chris@82 105 TJ = VFNMS(LDK(KP500000000), TI, TF);
Chris@82 106 ST(&(xo[WS(os, 3)]), VFMAI(TK, TJ), ovs, &(xo[WS(os, 1)]));
Chris@82 107 ST(&(xo[0]), VADD(TI, TF), ovs, &(xo[0]));
Chris@82 108 ST(&(xo[WS(os, 6)]), VFNMSI(TK, TJ), ovs, &(xo[0]));
Chris@82 109 {
/* Outputs 7 and 2: a pair formed from the same two operands (Tq, Tl) via VFNMSI and VFMAI respectively. */
Chris@82 110 V Tk, To, Tj, Tn, Tl, Tq;
Chris@82 111 Tj = VFNMS(LDK(KP347296355), Ti, Td);
Chris@82 112 Tk = VFNMS(LDK(KP907603734), Tj, Ta);
Chris@82 113 Tn = VFNMS(LDK(KP420276625), Tm, Te);
Chris@82 114 To = VFNMS(LDK(KP826351822), Tn, Th);
Chris@82 115 Tl = VFNMS(LDK(KP939692620), Tk, T5);
Chris@82 116 Tq = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tp, To));
Chris@82 117 ST(&(xo[WS(os, 7)]), VFNMSI(Tq, Tl), ovs, &(xo[WS(os, 1)]));
Chris@82 118 ST(&(xo[WS(os, 2)]), VFMAI(Tq, Tl), ovs, &(xo[0]));
Chris@82 119 }
Chris@82 120 {
/* Outputs 1 and 8 (from TE, TD) and outputs 4 and 5 (from TC, Ty). */
Chris@82 121 V Tx, TD, TB, TE, Ty, TC;
Chris@82 122 {
Chris@82 123 V Tt, Tw, Tz, TA;
Chris@82 124 Tt = VFNMS(LDK(KP673648177), Ts, Tr);
Chris@82 125 Tw = VFMA(LDK(KP898197570), Tv, Tu);
Chris@82 126 Tx = VFNMS(LDK(KP500000000), Tw, Tt);
Chris@82 127 TD = VFMA(LDK(KP852868531), Tw, T5);
Chris@82 128 Tz = VFNMS(LDK(KP898197570), Tv, Tu);
Chris@82 129 TA = VFMA(LDK(KP673648177), Ts, Tr);
Chris@82 130 TB = VFMA(LDK(KP666666666), TA, Tz);
Chris@82 131 TE = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tp, TA));
Chris@82 132 }
Chris@82 133 ST(&(xo[WS(os, 1)]), VFMAI(TE, TD), ovs, &(xo[WS(os, 1)]));
Chris@82 134 ST(&(xo[WS(os, 8)]), VFNMSI(TE, TD), ovs, &(xo[0]));
Chris@82 135 Ty = VFMA(LDK(KP852868531), Tx, T5);
Chris@82 136 TC = VMUL(LDK(KP866025403), VFNMS(LDK(KP852868531), TB, Tp));
Chris@82 137 ST(&(xo[WS(os, 4)]), VFMAI(TC, Ty), ovs, &(xo[0]));
Chris@82 138 ST(&(xo[WS(os, 5)]), VFNMSI(TC, Ty), ovs, &(xo[WS(os, 1)]));
Chris@82 139 }
Chris@82 140 }
Chris@82 141 }
/* VLEAVE is invoked once, after the loop has finished (macro defined outside this file). */
Chris@82 142 VLEAVE();
Chris@82 143 }
Chris@82 144
/*
 * Descriptor passed along with the kernel when it is registered: size 9, the
 * codelet name string, and the tuple {12, 4, 34, 0}, whose first three entries
 * equal the "12 additions, 4 multiplications, 34 fused multiply/add" tally in
 * this variant's header comment. It ends with a pointer to GENUS and four
 * zero fields.
 * NOTE(review): the meaning of the trailing fields is fixed by the kdft_desc
 * declaration, which is not visible in this file - confirm there.
 */
Chris@82 145 static const kdft_desc desc = { 9, XSIMD_STRING("n1bv_9"), {12, 4, 34, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 146
/*
 * Registration entry point for the FMA variant: hands the planner p, the
 * n1bv_9 kernel and its descriptor to X(kdft_register).
 * NOTE(review): XSIMD() and X() are presumably name-decorating macros; they
 * are defined outside this file.
 */
Chris@82 147 void XSIMD(codelet_n1bv_9) (planner *p) {
Chris@82 148 X(kdft_register) (p, n1bv_9, &desc);
Chris@82 149 }
Chris@82 150
Chris@82 151 #else
Chris@82 152
Chris@82 153 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 9 -name n1bv_9 -include dft/simd/n1b.h */
Chris@82 154
Chris@82 155 /*
Chris@82 156 * This function contains 46 FP additions, 26 FP multiplications,
Chris@82 157 * (or, 30 additions, 10 multiplications, 16 fused multiply/add),
Chris@82 158 * 41 stack variables, 14 constants, and 18 memory accesses
Chris@82 159 */
Chris@82 160 #include "dft/simd/n1b.h"
Chris@82 161
/*
 * n1bv_9 (variant compiled in the #else branch, i.e. when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined): generated SIMD
 * kernel. According to the generator command line recorded above this
 * function it was produced with "-n 9 -sign 1" and without "-fma".
 *
 * Parameters, as used by this body:
 *   ri, ro   - not referenced anywhere in this function.
 *   ii, io   - input / output base pointers; copied into xi / xo.
 *   is, os   - strides handed to WS() to address the 9 inputs / 9 outputs.
 *   v        - loop count; the loop runs while i > 0 and subtracts VL per pass.
 *   ivs, ovs - xi / xo advance by VL * ivs and VL * ovs per pass; the same
 *              values are also forwarded to LD and ST.
 *
 * Each pass loads xi[WS(is, k)] and stores xo[WS(os, k)] for k = 0..8.
 *
 * NOTE(review): the arithmetic notes below assume VFMA(a, b, c) = a*b + c,
 * VFNMS(a, b, c) = c - a*b, and that VBYI multiplies its argument by i.
 * Those macros are defined outside this file - confirm against the SIMD
 * headers before relying on the notes.
 */
Chris@82 162 static void n1bv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 163 {
/* Constants declared through DVK; the digits in each KP name are the leading digits of its value. */
Chris@82 164 DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
Chris@82 165 DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
Chris@82 166 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@82 167 DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
Chris@82 168 DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
Chris@82 169 DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
Chris@82 170 DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
Chris@82 171 DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
Chris@82 172 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@82 173 DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
Chris@82 174 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@82 175 DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
Chris@82 176 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 177 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 178 {
Chris@82 179 INT i;
Chris@82 180 const R *xi;
Chris@82 181 R *xo;
/* Only the ii / io pointers are used; ri and ro are never read here. */
Chris@82 182 xi = ii;
Chris@82 183 xo = io;
/* One pass per VL units of v; MAKE_VOLATILE_STRIDE is an external macro applied to both strides on every pass. */
Chris@82 184 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
Chris@82 185 V T5, Ty, Tm, Ti, Tw, Th, Tj, To, Tb, Tv, Ta, Tc, Tn;
Chris@82 186 {
/* Inputs 0, 3, 6: T4 = x3 + x6, Ty = x0 + x3 + x6, Tm = KP866025403 * (x3 - x6); T5 combines x0 with half of T4. */
Chris@82 187 V T1, T2, T3, T4;
Chris@82 188 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 189 T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 190 T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 191 T4 = VADD(T2, T3);
Chris@82 192 T5 = VFNMS(LDK(KP500000000), T4, T1);
Chris@82 193 Ty = VADD(T1, T4);
Chris@82 194 Tm = VMUL(LDK(KP866025403), VSUB(T2, T3));
Chris@82 195 }
Chris@82 196 {
/* Inputs 2, 5, 8: Tg = x5 + x8, Ti = x5 - x8, Tw = x2 + x5 + x8; Tj and To are scaled combinations of Th and Ti. */
Chris@82 197 V Td, Tg, Te, Tf;
Chris@82 198 Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 199 Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 200 Tf = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 201 Tg = VADD(Te, Tf);
Chris@82 202 Ti = VSUB(Te, Tf);
Chris@82 203 Tw = VADD(Td, Tg);
Chris@82 204 Th = VFNMS(LDK(KP500000000), Tg, Td);
Chris@82 205 Tj = VFNMS(LDK(KP852868531), Ti, VMUL(LDK(KP173648177), Th));
Chris@82 206 To = VFMA(LDK(KP150383733), Ti, VMUL(LDK(KP984807753), Th));
Chris@82 207 }
Chris@82 208 {
/* Inputs 1, 4, 7: T9 = x4 + x7, Tb = x4 - x7, Tv = x1 + x4 + x7; Tc and Tn are scaled combinations of Ta and Tb. */
Chris@82 209 V T6, T9, T7, T8;
Chris@82 210 T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 211 T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 212 T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 213 T9 = VADD(T7, T8);
Chris@82 214 Tb = VSUB(T7, T8);
Chris@82 215 Tv = VADD(T6, T9);
Chris@82 216 Ta = VFNMS(LDK(KP500000000), T9, T6);
Chris@82 217 Tc = VFNMS(LDK(KP556670399), Tb, VMUL(LDK(KP766044443), Ta));
Chris@82 218 Tn = VFMA(LDK(KP663413948), Tb, VMUL(LDK(KP642787609), Ta));
Chris@82 219 }
Chris@82 220 {
/* Outputs 3, 0, 6 are built from the three column sums Ty, Tv, Tw; output 0 is Ty + Tz, i.e. the sum of all nine inputs. */
Chris@82 221 V Tx, Tz, TA, Tt, Tu;
Chris@82 222 Tx = VBYI(VMUL(LDK(KP866025403), VSUB(Tv, Tw)));
Chris@82 223 Tz = VADD(Tv, Tw);
Chris@82 224 TA = VFNMS(LDK(KP500000000), Tz, Ty);
Chris@82 225 ST(&(xo[WS(os, 3)]), VADD(Tx, TA), ovs, &(xo[WS(os, 1)]));
Chris@82 226 ST(&(xo[0]), VADD(Ty, Tz), ovs, &(xo[0]));
Chris@82 227 ST(&(xo[WS(os, 6)]), VSUB(TA, Tx), ovs, &(xo[0]));
/* Outputs 7 and 2: Tt - Tu and Tt + Tu, where Tu is wrapped in VBYI. */
Chris@82 228 Tt = VFMA(LDK(KP852868531), Tb, VFMA(LDK(KP173648177), Ta, VFMA(LDK(KP296198132), Ti, VFNMS(LDK(KP939692620), Th, T5))));
Chris@82 229 Tu = VBYI(VSUB(VFMA(LDK(KP984807753), Ta, VFMA(LDK(KP813797681), Ti, VFNMS(LDK(KP150383733), Tb, VMUL(LDK(KP342020143), Th)))), Tm));
Chris@82 230 ST(&(xo[WS(os, 7)]), VSUB(Tt, Tu), ovs, &(xo[WS(os, 1)]));
Chris@82 231 ST(&(xo[WS(os, 2)]), VADD(Tt, Tu), ovs, &(xo[0]));
Chris@82 232 {
/* Outputs 8 and 1 are Tl - Tq and Tl + Tq; outputs 5 and 4 are Ts - Tr and Tr + Ts. */
Chris@82 233 V Tl, Ts, Tq, Tr, Tk, Tp;
Chris@82 234 Tk = VADD(Tc, Tj);
Chris@82 235 Tl = VADD(T5, Tk);
Chris@82 236 Ts = VFMA(LDK(KP866025403), VSUB(To, Tn), VFNMS(LDK(KP500000000), Tk, T5));
Chris@82 237 Tp = VADD(Tn, To);
Chris@82 238 Tq = VBYI(VADD(Tm, Tp));
Chris@82 239 Tr = VBYI(VADD(Tm, VFNMS(LDK(KP500000000), Tp, VMUL(LDK(KP866025403), VSUB(Tc, Tj)))));
Chris@82 240 ST(&(xo[WS(os, 8)]), VSUB(Tl, Tq), ovs, &(xo[0]));
Chris@82 241 ST(&(xo[WS(os, 5)]), VSUB(Ts, Tr), ovs, &(xo[WS(os, 1)]));
Chris@82 242 ST(&(xo[WS(os, 1)]), VADD(Tl, Tq), ovs, &(xo[WS(os, 1)]));
Chris@82 243 ST(&(xo[WS(os, 4)]), VADD(Tr, Ts), ovs, &(xo[0]));
Chris@82 244 }
Chris@82 245 }
Chris@82 246 }
Chris@82 247 }
/* VLEAVE is invoked once, after the loop has finished (macro defined outside this file). */
Chris@82 248 VLEAVE();
Chris@82 249 }
Chris@82 250
/*
 * Descriptor passed along with the kernel when it is registered: size 9, the
 * codelet name string, and the tuple {30, 10, 16, 0}, whose first three entries
 * equal the "30 additions, 10 multiplications, 16 fused multiply/add" tally in
 * this variant's header comment. It ends with a pointer to GENUS and four
 * zero fields.
 * NOTE(review): the meaning of the trailing fields is fixed by the kdft_desc
 * declaration, which is not visible in this file - confirm there.
 */
Chris@82 251 static const kdft_desc desc = { 9, XSIMD_STRING("n1bv_9"), {30, 10, 16, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 252
/*
 * Registration entry point for the non-FMA variant: hands the planner p, the
 * n1bv_9 kernel and its descriptor to X(kdft_register).
 * NOTE(review): XSIMD() and X() are presumably name-decorating macros; they
 * are defined outside this file.
 */
Chris@82 253 void XSIMD(codelet_n1bv_9) (planner *p) {
Chris@82 254 X(kdft_register) (p, n1bv_9, &desc);
Chris@82 255 }
Chris@82 256
Chris@82 257 #endif