annotate src/fftw-3.3.5/dft/simd/common/n1bv_9.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:39:03 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 9 -name n1bv_9 -include n1b.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 46 FP additions, 38 FP multiplications,
Chris@42 32 * (or, 12 additions, 4 multiplications, 34 fused multiply/add),
Chris@42 33 * 68 stack variables, 19 constants, and 18 memory accesses
Chris@42 34 */
Chris@42 35 #include "n1b.h"
Chris@42 36
/*
 * n1bv_9 (HAVE_FMA variant): genfft-generated SIMD codelet. Per the generator
 * command line quoted above, it was produced by gen_notw_c with -n 9 -sign 1
 * (presumably FFTW's "backward" direction -- confirm against the FFTW docs).
 *
 * Parameters as used by this body:
 *   ii, io   - input / output base pointers (copied to xi / xo).
 *   ri, ro   - not referenced in this body.
 *   is, os   - strides handed to WS() to address the nine inputs / outputs.
 *   v        - loop count; the loop runs while i > 0, decrementing by VL.
 *   ivs, ovs - per-iteration pointer advance is VL * ivs (input) and
 *              VL * ovs (output); also forwarded to LD / ST.
 *
 * Each iteration loads nine V values from xi[WS(is, 0..8)] and stores nine
 * V values to xo[WS(os, 0..8)].
 *
 * NOTE(review): LD, ST, WS, DVK, LDK, VADD, VSUB, VMUL, VFMA, VFNMS, VFMAI,
 * VFNMSI, MAKE_VOLATILE_STRIDE and VLEAVE are macros defined outside this
 * file; their exact semantics are not visible here.
 * The file header marks this code as automatically generated -- DO NOT EDIT.
 */
Chris@42 37 static void n1bv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Constants declared through DVK; each KPnnnnnnnnn name spells the leading
 * decimal digits of its value (e.g. KP500000000 is 0.5). */
Chris@42 39 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@42 40 DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
Chris@42 41 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@42 42 DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
Chris@42 43 DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
Chris@42 44 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@42 45 DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
Chris@42 46 DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
Chris@42 47 DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
Chris@42 48 DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
Chris@42 49 DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
Chris@42 50 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 51 DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
Chris@42 52 DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
Chris@42 53 DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
Chris@42 54 DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
Chris@42 55 DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
Chris@42 56 DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
Chris@42 57 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 58 {
Chris@42 59 INT i;
Chris@42 60 const R *xi;
Chris@42 61 R *xo;
/* Only the ii / io pointers are used as the working input / output. */
Chris@42 62 xi = ii;
Chris@42 63 xo = io;
/* One pass per VL transforms: i counts down from v, xi / xo step by VL * ivs / VL * ovs. */
Chris@42 64 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
Chris@42 65 V T1, T2, T3, T6, Tf, T7, T8, Tb, Tc, Tp, T4;
/* Load the nine inputs (indices 0, 3, 6, 2, 1, 5, 8, 4, 7 in this order). */
Chris@42 66 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 67 T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 68 T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 69 T6 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 70 Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 71 T7 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 72 T8 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 73 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 74 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
/* Tp / T4: VSUB / VADD of inputs 3 and 6. */
Chris@42 75 Tp = VSUB(T2, T3);
Chris@42 76 T4 = VADD(T2, T3);
Chris@42 77 {
Chris@42 78 V Te, T9, Tg, Td, TF, T5;
/* Pairwise VADD / VSUB of inputs (5, 8) and (4, 7); TF and T5 combine input 0 with T4. */
Chris@42 79 Te = VSUB(T8, T7);
Chris@42 80 T9 = VADD(T7, T8);
Chris@42 81 Tg = VADD(Tb, Tc);
Chris@42 82 Td = VSUB(Tb, Tc);
Chris@42 83 TF = VADD(T1, T4);
Chris@42 84 T5 = VFNMS(LDK(KP500000000), T4, T1);
Chris@42 85 {
Chris@42 86 V Ta, TH, Th, TG;
/* TH is built from inputs 2, 5, 8 and TG from inputs 1, 4, 7. */
Chris@42 87 Ta = VFNMS(LDK(KP500000000), T9, T6);
Chris@42 88 TH = VADD(T6, T9);
Chris@42 89 Th = VFNMS(LDK(KP500000000), Tg, Tf);
Chris@42 90 TG = VADD(Tf, Tg);
Chris@42 91 {
Chris@42 92 V Tr, Tu, Tm, Tv, Ts, Ti, TI, TK;
Chris@42 93 Tr = VFNMS(LDK(KP152703644), Te, Ta);
Chris@42 94 Tu = VFMA(LDK(KP203604859), Ta, Te);
Chris@42 95 Tm = VFNMS(LDK(KP439692620), Td, Ta);
Chris@42 96 Tv = VFNMS(LDK(KP726681596), Td, Th);
Chris@42 97 Ts = VFMA(LDK(KP968908795), Th, Td);
Chris@42 98 Ti = VFNMS(LDK(KP586256827), Th, Te);
Chris@42 99 TI = VADD(TG, TH);
Chris@42 100 TK = VMUL(LDK(KP866025403), VSUB(TG, TH));
Chris@42 101 {
Chris@42 102 V Tt, TA, Tw, Tz, Tj, TJ, To, TE, Tn;
Chris@42 103 Tn = VFNMS(LDK(KP420276625), Tm, Te);
Chris@42 104 Tt = VFNMS(LDK(KP673648177), Ts, Tr);
Chris@42 105 TA = VFMA(LDK(KP673648177), Ts, Tr);
Chris@42 106 Tw = VFMA(LDK(KP898197570), Tv, Tu);
Chris@42 107 Tz = VFNMS(LDK(KP898197570), Tv, Tu);
Chris@42 108 Tj = VFNMS(LDK(KP347296355), Ti, Td);
/* Output 0 = VADD(TI, TF): TF covers inputs 0, 3, 6 and TI covers the other six. */
Chris@42 109 ST(&(xo[0]), VADD(TI, TF), ovs, &(xo[0]));
Chris@42 110 TJ = VFNMS(LDK(KP500000000), TI, TF);
Chris@42 111 To = VFNMS(LDK(KP826351822), Tn, Th);
Chris@42 112 TE = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tp, TA));
Chris@42 113 {
Chris@42 114 V TB, TD, Tx, Tk, Tq, TC, Ty, Tl;
Chris@42 115 TB = VFMA(LDK(KP666666666), TA, Tz);
Chris@42 116 TD = VFMA(LDK(KP852868531), Tw, T5);
Chris@42 117 Tx = VFNMS(LDK(KP500000000), Tw, Tt);
Chris@42 118 Tk = VFNMS(LDK(KP907603734), Tj, Ta);
/* The remaining outputs are stored in pairs (k, 9 - k); each pair applies
 * VFMAI and VFNMSI to the same two temporaries: (3, 6), (1, 8), (4, 5), (2, 7). */
Chris@42 119 ST(&(xo[WS(os, 6)]), VFNMSI(TK, TJ), ovs, &(xo[0]));
Chris@42 120 ST(&(xo[WS(os, 3)]), VFMAI(TK, TJ), ovs, &(xo[WS(os, 1)]));
Chris@42 121 Tq = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tp, To));
Chris@42 122 TC = VMUL(LDK(KP866025403), VFNMS(LDK(KP852868531), TB, Tp));
Chris@42 123 ST(&(xo[WS(os, 8)]), VFNMSI(TE, TD), ovs, &(xo[0]));
Chris@42 124 ST(&(xo[WS(os, 1)]), VFMAI(TE, TD), ovs, &(xo[WS(os, 1)]));
Chris@42 125 Ty = VFMA(LDK(KP852868531), Tx, T5);
Chris@42 126 Tl = VFNMS(LDK(KP939692620), Tk, T5);
Chris@42 127 ST(&(xo[WS(os, 5)]), VFNMSI(TC, Ty), ovs, &(xo[WS(os, 1)]));
Chris@42 128 ST(&(xo[WS(os, 4)]), VFMAI(TC, Ty), ovs, &(xo[0]));
Chris@42 129 ST(&(xo[WS(os, 2)]), VFMAI(Tq, Tl), ovs, &(xo[0]));
Chris@42 130 ST(&(xo[WS(os, 7)]), VFNMSI(Tq, Tl), ovs, &(xo[WS(os, 1)]));
Chris@42 131 }
Chris@42 132 }
Chris@42 133 }
Chris@42 134 }
Chris@42 135 }
Chris@42 136 }
Chris@42 137 }
/* NOTE(review): VLEAVE() is invoked once after the loop; its purpose is defined by the SIMD headers, not shown here. */
Chris@42 138 VLEAVE();
Chris@42 139 }
Chris@42 140
/* Descriptor handed to kdft_register below: first field 9, the codelet name
 * string "n1bv_9", and {12, 4, 34, 0}, whose first three entries match the
 * "12 additions, 4 multiplications, 34 fused multiply/add" counts quoted in
 * the generated comment above. The meaning of the remaining fields is defined
 * by kdft_desc, which is declared outside this file. */
Chris@42 141 static const kdft_desc desc = { 9, XSIMD_STRING("n1bv_9"), {12, 4, 34, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 142
/* Registration entry point (HAVE_FMA build): passes the n1bv_9 function and
 * its descriptor desc to X(kdft_register) for the given planner p. */
Chris@42 143 void XSIMD(codelet_n1bv_9) (planner *p) {
Chris@42 144 X(kdft_register) (p, n1bv_9, &desc);
Chris@42 145 }
Chris@42 146
Chris@42 147 #else /* HAVE_FMA */
Chris@42 148
Chris@42 149 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 9 -name n1bv_9 -include n1b.h */
Chris@42 150
Chris@42 151 /*
Chris@42 152 * This function contains 46 FP additions, 26 FP multiplications,
Chris@42 153 * (or, 30 additions, 10 multiplications, 16 fused multiply/add),
Chris@42 154 * 41 stack variables, 14 constants, and 18 memory accesses
Chris@42 155 */
Chris@42 156 #include "n1b.h"
Chris@42 157
/*
 * n1bv_9 (variant compiled when HAVE_FMA is not defined): genfft-generated
 * SIMD codelet. Per the generator command line quoted above, it was produced
 * by gen_notw_c with -n 9 -sign 1 (presumably FFTW's "backward" direction --
 * confirm against the FFTW docs).
 *
 * Parameters as used by this body:
 *   ii, io   - input / output base pointers (copied to xi / xo).
 *   ri, ro   - not referenced in this body.
 *   is, os   - strides handed to WS() to address the nine inputs / outputs.
 *   v        - loop count; the loop runs while i > 0, decrementing by VL.
 *   ivs, ovs - per-iteration pointer advance is VL * ivs (input) and
 *              VL * ovs (output); also forwarded to LD / ST.
 *
 * Each iteration loads nine V values from xi[WS(is, 0..8)] in three groups
 * ({0,3,6}, {2,5,8}, {1,4,7}) and stores nine V values to xo[WS(os, 0..8)].
 *
 * NOTE(review): LD, ST, WS, DVK, LDK, VADD, VSUB, VMUL, VFMA, VFNMS, VBYI,
 * MAKE_VOLATILE_STRIDE and VLEAVE are macros defined outside this file; their
 * exact semantics are not visible here.
 * The file header marks this code as automatically generated -- DO NOT EDIT.
 */
Chris@42 158 static void n1bv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 159 {
/* Constants declared through DVK; each KPnnnnnnnnn name spells the leading
 * decimal digits of its value (e.g. KP500000000 is 0.5). */
Chris@42 160 DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
Chris@42 161 DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
Chris@42 162 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@42 163 DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
Chris@42 164 DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
Chris@42 165 DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
Chris@42 166 DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
Chris@42 167 DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
Chris@42 168 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@42 169 DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
Chris@42 170 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@42 171 DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
Chris@42 172 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 173 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 174 {
Chris@42 175 INT i;
Chris@42 176 const R *xi;
Chris@42 177 R *xo;
/* Only the ii / io pointers are used as the working input / output. */
Chris@42 178 xi = ii;
Chris@42 179 xo = io;
/* One pass per VL transforms: i counts down from v, xi / xo step by VL * ivs / VL * ovs. */
Chris@42 180 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
Chris@42 181 V T5, Ty, Tm, Ti, Tw, Th, Tj, To, Tb, Tv, Ta, Tc, Tn;
/* Group 1: inputs 0, 3, 6 -> T5, Ty, Tm. */
Chris@42 182 {
Chris@42 183 V T1, T2, T3, T4;
Chris@42 184 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 185 T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 186 T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 187 T4 = VADD(T2, T3);
Chris@42 188 T5 = VFNMS(LDK(KP500000000), T4, T1);
Chris@42 189 Ty = VADD(T1, T4);
Chris@42 190 Tm = VMUL(LDK(KP866025403), VSUB(T2, T3));
Chris@42 191 }
/* Group 2: inputs 2, 5, 8 -> Ti, Tw, Th, Tj, To. */
Chris@42 192 {
Chris@42 193 V Td, Tg, Te, Tf;
Chris@42 194 Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 195 Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 196 Tf = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 197 Tg = VADD(Te, Tf);
Chris@42 198 Ti = VSUB(Te, Tf);
Chris@42 199 Tw = VADD(Td, Tg);
Chris@42 200 Th = VFNMS(LDK(KP500000000), Tg, Td);
Chris@42 201 Tj = VFNMS(LDK(KP852868531), Ti, VMUL(LDK(KP173648177), Th));
Chris@42 202 To = VFMA(LDK(KP150383733), Ti, VMUL(LDK(KP984807753), Th));
Chris@42 203 }
/* Group 3: inputs 1, 4, 7 -> Tb, Tv, Ta, Tc, Tn. */
Chris@42 204 {
Chris@42 205 V T6, T9, T7, T8;
Chris@42 206 T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 207 T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 208 T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 209 T9 = VADD(T7, T8);
Chris@42 210 Tb = VSUB(T7, T8);
Chris@42 211 Tv = VADD(T6, T9);
Chris@42 212 Ta = VFNMS(LDK(KP500000000), T9, T6);
Chris@42 213 Tc = VFNMS(LDK(KP556670399), Tb, VMUL(LDK(KP766044443), Ta));
Chris@42 214 Tn = VFMA(LDK(KP663413948), Tb, VMUL(LDK(KP642787609), Ta));
Chris@42 215 }
/* Combine the three groups and store. Output 0 is VADD(Ty, Tz), i.e. built
 * from all nine inputs; the other outputs are written in pairs (k, 9 - k) as
 * VADD / VSUB of the same two temporaries. */
Chris@42 216 {
Chris@42 217 V Tx, Tz, TA, Tt, Tu;
Chris@42 218 Tx = VBYI(VMUL(LDK(KP866025403), VSUB(Tv, Tw)));
Chris@42 219 Tz = VADD(Tv, Tw);
Chris@42 220 TA = VFNMS(LDK(KP500000000), Tz, Ty);
/* Outputs 3, 0, 6. */
Chris@42 221 ST(&(xo[WS(os, 3)]), VADD(Tx, TA), ovs, &(xo[WS(os, 1)]));
Chris@42 222 ST(&(xo[0]), VADD(Ty, Tz), ovs, &(xo[0]));
Chris@42 223 ST(&(xo[WS(os, 6)]), VSUB(TA, Tx), ovs, &(xo[0]));
Chris@42 224 Tt = VFMA(LDK(KP852868531), Tb, VFMA(LDK(KP173648177), Ta, VFMA(LDK(KP296198132), Ti, VFNMS(LDK(KP939692620), Th, T5))));
Chris@42 225 Tu = VBYI(VSUB(VFMA(LDK(KP984807753), Ta, VFMA(LDK(KP813797681), Ti, VFNMS(LDK(KP150383733), Tb, VMUL(LDK(KP342020143), Th)))), Tm));
/* Outputs 7 and 2 from Tt, Tu. */
Chris@42 226 ST(&(xo[WS(os, 7)]), VSUB(Tt, Tu), ovs, &(xo[WS(os, 1)]));
Chris@42 227 ST(&(xo[WS(os, 2)]), VADD(Tt, Tu), ovs, &(xo[0]));
Chris@42 228 {
Chris@42 229 V Tl, Ts, Tq, Tr, Tk, Tp;
Chris@42 230 Tk = VADD(Tc, Tj);
Chris@42 231 Tl = VADD(T5, Tk);
Chris@42 232 Ts = VFMA(LDK(KP866025403), VSUB(To, Tn), VFNMS(LDK(KP500000000), Tk, T5));
Chris@42 233 Tp = VADD(Tn, To);
Chris@42 234 Tq = VBYI(VADD(Tm, Tp));
Chris@42 235 Tr = VBYI(VADD(Tm, VFNMS(LDK(KP500000000), Tp, VMUL(LDK(KP866025403), VSUB(Tc, Tj)))));
/* Outputs 8 / 1 from Tl, Tq and 5 / 4 from Ts, Tr. */
Chris@42 236 ST(&(xo[WS(os, 8)]), VSUB(Tl, Tq), ovs, &(xo[0]));
Chris@42 237 ST(&(xo[WS(os, 5)]), VSUB(Ts, Tr), ovs, &(xo[WS(os, 1)]));
Chris@42 238 ST(&(xo[WS(os, 1)]), VADD(Tl, Tq), ovs, &(xo[WS(os, 1)]));
Chris@42 239 ST(&(xo[WS(os, 4)]), VADD(Tr, Ts), ovs, &(xo[0]));
Chris@42 240 }
Chris@42 241 }
Chris@42 242 }
Chris@42 243 }
/* NOTE(review): VLEAVE() is invoked once after the loop; its purpose is defined by the SIMD headers, not shown here. */
Chris@42 244 VLEAVE();
Chris@42 245 }
Chris@42 246
/* Descriptor handed to kdft_register below: first field 9, the codelet name
 * string "n1bv_9", and {30, 10, 16, 0}, whose first three entries match the
 * "30 additions, 10 multiplications, 16 fused multiply/add" counts quoted in
 * the generated comment above. The meaning of the remaining fields is defined
 * by kdft_desc, which is declared outside this file. */
Chris@42 247 static const kdft_desc desc = { 9, XSIMD_STRING("n1bv_9"), {30, 10, 16, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 248
/* Registration entry point (build without HAVE_FMA): passes the n1bv_9
 * function and its descriptor desc to X(kdft_register) for the given planner p. */
Chris@42 249 void XSIMD(codelet_n1bv_9) (planner *p) {
Chris@42 250 X(kdft_register) (p, n1bv_9, &desc);
Chris@42 251 }
Chris@42 252
Chris@42 253 #endif /* HAVE_FMA */