annotate src/fftw-3.3.5/dft/simd/common/n1fv_9.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:38:39 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name n1fv_9 -include n1f.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 46 FP additions, 38 FP multiplications,
Chris@42 32 * (or, 12 additions, 4 multiplications, 34 fused multiply/add),
Chris@42 33 * 68 stack variables, 19 constants, and 18 memory accesses
Chris@42 34 */
Chris@42 35 #include "n1f.h"
Chris@42 36
/*
 * n1fv_9, HAVE_FMA branch.
 *
 * Each pass of the loop loads nine V values from xi (offsets 0 and
 * WS(is, 1) .. WS(is, 8)), combines them with the vector macros
 * (VADD, VSUB, VMUL, VFMA, VFNMS, VFMAI, VFNMSI) and the constants declared
 * below, and stores nine V results to xo (offsets 0 and WS(os, 1) ..
 * WS(os, 8)).
 *
 * NOTE(review): the generator command line above (gen_notw_c ... -n 9)
 * suggests this is a size-9 no-twiddle DFT kernel; the macro semantics are
 * defined outside this file, so confirm against the SIMD headers.
 *
 * Parameters:
 *   ri  - input base pointer; copied into xi.
 *   ii  - not referenced anywhere in this body.
 *   ro  - output base pointer; copied into xo.
 *   io  - not referenced anywhere in this body.
 *   is  - input stride, used through WS(is, k) and MAKE_VOLATILE_STRIDE.
 *   os  - output stride, used through WS(os, k) and MAKE_VOLATILE_STRIDE.
 *   v   - loop count: i starts at v and drops by VL per pass while i > 0.
 *   ivs - passed to every LD; xi advances by VL * ivs per pass.
 *   ovs - passed to every ST; xo advances by VL * ovs per pass.
 *
 * Returns nothing; results are written through xo only.
 */
Chris@42 37 static void n1fv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Constants referenced through LDK() below; each KPnnnnnnnnn name spells out
 * the leading decimal digits of its own value. */
Chris@42 39 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@42 40 DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
Chris@42 41 DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
Chris@42 42 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@42 43 DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
Chris@42 44 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@42 45 DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
Chris@42 46 DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
Chris@42 47 DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
Chris@42 48 DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
Chris@42 49 DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
Chris@42 50 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 51 DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
Chris@42 52 DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
Chris@42 53 DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
Chris@42 54 DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
Chris@42 55 DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
Chris@42 56 DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
Chris@42 57 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 58 {
Chris@42 59 INT i;
Chris@42 60 const R *xi;
Chris@42 61 R *xo;
Chris@42 62 xi = ri;
Chris@42 63 xo = ro;
/* One pass consumes VL of the v requested items; both pointers step by VL
 * times their respective vector stride. MAKE_VOLATILE_STRIDE is applied to
 * is and os on every pass (macro defined elsewhere). */
Chris@42 64 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
Chris@42 65 V T1, T2, T3, T6, Tb, T7, T8, Tc, Td, Tv, T4;
/* Load all nine inputs up front. The third LD argument alternates between
 * &xi[0] and &xi[WS(is, 1)]; its meaning is fixed by the LD macro, which is
 * not visible here. */
Chris@42 66 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 67 T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 68 T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 69 T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 70 Tb = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 71 T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 72 T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 73 Tc = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 74 Td = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
/* Pairwise VSUB/VADD of inputs 3 and 6. */
Chris@42 75 Tv = VSUB(T3, T2);
Chris@42 76 T4 = VADD(T2, T3);
Chris@42 77 {
Chris@42 78 V Tl, T9, Tm, Te, Tj, T5;
/* Same pairing for inputs 4/7 (Tl, T9) and inputs 5/8 (Tm, Te). */
Chris@42 79 Tl = VSUB(T7, T8);
Chris@42 80 T9 = VADD(T7, T8);
Chris@42 81 Tm = VSUB(Td, Tc);
Chris@42 82 Te = VADD(Tc, Td);
/* Each pair sum is combined with the remaining input of its stride-3 group
 * (0, 1 or 2): once through VFNMS with KP500000000 (Tj, Tn, Tk) and once
 * through VADD (T5, Ta, Tf). */
Chris@42 83 Tj = VFNMS(LDK(KP500000000), T4, T1);
Chris@42 84 T5 = VADD(T1, T4);
Chris@42 85 {
Chris@42 86 V Tn, Ta, Tk, Tf;
Chris@42 87 Tn = VFNMS(LDK(KP500000000), T9, T6);
Chris@42 88 Ta = VADD(T6, T9);
Chris@42 89 Tk = VFNMS(LDK(KP500000000), Te, Tb);
Chris@42 90 Tf = VADD(Tb, Te);
Chris@42 91 {
Chris@42 92 V Ty, TC, To, TB, Tx, Ts, Tg, Ti;
Chris@42 93 Ty = VFNMS(LDK(KP726681596), Tl, Tn);
Chris@42 94 TC = VFMA(LDK(KP968908795), Tn, Tl);
Chris@42 95 To = VFNMS(LDK(KP586256827), Tn, Tm);
Chris@42 96 TB = VFNMS(LDK(KP152703644), Tm, Tk);
Chris@42 97 Tx = VFMA(LDK(KP203604859), Tk, Tm);
Chris@42 98 Ts = VFNMS(LDK(KP439692620), Tl, Tk);
/* Tg and Ti depend only on the group sums Ta and Tf; they feed outputs
 * 0, 3 and 6 below. */
Chris@42 99 Tg = VADD(Ta, Tf);
Chris@42 100 Ti = VMUL(LDK(KP866025403), VSUB(Tf, Ta));
Chris@42 101 {
Chris@42 102 V Tz, TI, TF, TD, Tt, Th, Tq, Tp;
Chris@42 103 Tp = VFNMS(LDK(KP347296355), To, Tl);
Chris@42 104 Tz = VFMA(LDK(KP898197570), Ty, Tx);
Chris@42 105 TI = VFNMS(LDK(KP898197570), Ty, Tx);
Chris@42 106 TF = VFNMS(LDK(KP673648177), TC, TB);
Chris@42 107 TD = VFMA(LDK(KP673648177), TC, TB);
Chris@42 108 Tt = VFNMS(LDK(KP420276625), Ts, Tm);
/* Output 0 is built purely from VADD over all nine loads (T5 covers inputs
 * 0/3/6, Tg covers 1/4/7 and 2/5/8). */
Chris@42 109 ST(&(xo[0]), VADD(T5, Tg), ovs, &(xo[0]));
Chris@42 110 Th = VFNMS(LDK(KP500000000), Tg, T5);
Chris@42 111 Tq = VFNMS(LDK(KP907603734), Tp, Tk);
Chris@42 112 {
Chris@42 113 V TA, TJ, TE, TG, Tu, Tr, TK, TH, Tw;
Chris@42 114 TA = VFMA(LDK(KP852868531), Tz, Tj);
Chris@42 115 TJ = VFMA(LDK(KP666666666), TD, TI);
Chris@42 116 TE = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tv, TD));
Chris@42 117 TG = VFNMS(LDK(KP500000000), Tz, TF);
Chris@42 118 Tu = VFNMS(LDK(KP826351822), Tt, Tn);
/* The remaining outputs are stored in pairs (6/3, 8/1, 4/5, 7/2); each pair
 * applies VFMAI and VFNMSI to the same two operands.
 * NOTE(review): VFMAI/VFNMSI presumably add/subtract i times the first
 * operand -- confirm in the SIMD header. */
Chris@42 119 ST(&(xo[WS(os, 6)]), VFNMSI(Ti, Th), ovs, &(xo[0]));
Chris@42 120 ST(&(xo[WS(os, 3)]), VFMAI(Ti, Th), ovs, &(xo[WS(os, 1)]));
Chris@42 121 Tr = VFNMS(LDK(KP939692620), Tq, Tj);
Chris@42 122 TK = VMUL(LDK(KP866025403), VFMA(LDK(KP852868531), TJ, Tv));
Chris@42 123 ST(&(xo[WS(os, 8)]), VFMAI(TE, TA), ovs, &(xo[0]));
Chris@42 124 ST(&(xo[WS(os, 1)]), VFNMSI(TE, TA), ovs, &(xo[WS(os, 1)]));
Chris@42 125 TH = VFMA(LDK(KP852868531), TG, Tj);
Chris@42 126 Tw = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tv, Tu));
Chris@42 127 ST(&(xo[WS(os, 4)]), VFMAI(TK, TH), ovs, &(xo[0]));
Chris@42 128 ST(&(xo[WS(os, 5)]), VFNMSI(TK, TH), ovs, &(xo[WS(os, 1)]));
Chris@42 129 ST(&(xo[WS(os, 7)]), VFMAI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
Chris@42 130 ST(&(xo[WS(os, 2)]), VFNMSI(Tw, Tr), ovs, &(xo[0]));
Chris@42 131 }
Chris@42 132 }
Chris@42 133 }
Chris@42 134 }
Chris@42 135 }
Chris@42 136 }
Chris@42 137 }
/* Invoked once after the loop; VLEAVE is defined outside this file. */
Chris@42 138 VLEAVE();
Chris@42 139 }
Chris@42 140
/*
 * Descriptor passed to X(kdft_register) for the HAVE_FMA build of n1fv_9.
 * The leading 9 matches "-n 9" on the generator command line, the string is
 * the codelet name, and {12, 4, 34, 0} repeats the "12 additions,
 * 4 multiplications, 34 fused multiply/add" counts from the generated
 * comment above. The meaning of the fourth count, &GENUS and the four
 * trailing zeros is defined by kdft_desc, which is not visible here.
 */
Chris@42 141 static const kdft_desc desc = { 9, XSIMD_STRING("n1fv_9"), {12, 4, 34, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 142
/*
 * Registration entry point (HAVE_FMA build): hands the n1fv_9 kernel and its
 * descriptor to the planner p through X(kdft_register).
 */
Chris@42 143 void XSIMD(codelet_n1fv_9) (planner *p) {
Chris@42 144 X(kdft_register) (p, n1fv_9, &desc);
Chris@42 145 }
Chris@42 146
Chris@42 147 #else /* HAVE_FMA */
Chris@42 148
Chris@42 149 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name n1fv_9 -include n1f.h */
Chris@42 150
Chris@42 151 /*
Chris@42 152 * This function contains 46 FP additions, 26 FP multiplications,
Chris@42 153 * (or, 30 additions, 10 multiplications, 16 fused multiply/add),
Chris@42 154 * 41 stack variables, 14 constants, and 18 memory accesses
Chris@42 155 */
Chris@42 156 #include "n1f.h"
Chris@42 157
/*
 * n1fv_9, #else branch (compiled when HAVE_FMA is not defined).
 *
 * Same signature and the same input/output offsets as the HAVE_FMA version:
 * each pass loads nine V values from xi (offsets 0 and WS(is, 1) ..
 * WS(is, 8)) and stores nine V results to xo (offsets 0 and WS(os, 1) ..
 * WS(os, 8)). This variant uses a different constant set and forms its
 * outputs with VBYI plus VADD/VSUB instead of VFMAI/VFNMSI.
 *
 * NOTE(review): the generator command line above (gen_notw_c ... -n 9)
 * suggests this is a size-9 no-twiddle DFT kernel; the macro semantics are
 * defined outside this file, so confirm against the SIMD headers.
 *
 * Parameters:
 *   ri  - input base pointer; copied into xi.
 *   ii  - not referenced anywhere in this body.
 *   ro  - output base pointer; copied into xo.
 *   io  - not referenced anywhere in this body.
 *   is  - input stride, used through WS(is, k) and MAKE_VOLATILE_STRIDE.
 *   os  - output stride, used through WS(os, k) and MAKE_VOLATILE_STRIDE.
 *   v   - loop count: i starts at v and drops by VL per pass while i > 0.
 *   ivs - passed to every LD; xi advances by VL * ivs per pass.
 *   ovs - passed to every ST; xo advances by VL * ovs per pass.
 *
 * Returns nothing; results are written through xo only.
 */
Chris@42 158 static void n1fv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 159 {
/* Constants referenced through LDK() below; each KPnnnnnnnnn name spells out
 * the leading decimal digits of its own value. */
Chris@42 160 DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
Chris@42 161 DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
Chris@42 162 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@42 163 DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
Chris@42 164 DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
Chris@42 165 DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
Chris@42 166 DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
Chris@42 167 DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
Chris@42 168 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@42 169 DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
Chris@42 170 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@42 171 DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
Chris@42 172 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 173 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 174 {
Chris@42 175 INT i;
Chris@42 176 const R *xi;
Chris@42 177 R *xo;
Chris@42 178 xi = ri;
Chris@42 179 xo = ro;
/* One pass consumes VL of the v requested items; both pointers step by VL
 * times their respective vector stride. MAKE_VOLATILE_STRIDE is applied to
 * is and os on every pass (macro defined elsewhere). */
Chris@42 180 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
Chris@42 181 V T5, Ts, Tj, To, Tf, Tn, Tp, Tu, Tl, Ta, Tk, Tm, Tt;
/* Group of inputs 0, 3, 6: produces T5, Ts and Tj. */
Chris@42 182 {
Chris@42 183 V T1, T2, T3, T4;
Chris@42 184 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 185 T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 186 T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 187 T4 = VADD(T2, T3);
Chris@42 188 T5 = VADD(T1, T4);
Chris@42 189 Ts = VMUL(LDK(KP866025403), VSUB(T3, T2));
Chris@42 190 Tj = VFNMS(LDK(KP500000000), T4, T1);
Chris@42 191 }
/* Group of inputs 2, 5, 8: produces To, Tf, Tn, Tp and Tu. */
Chris@42 192 {
Chris@42 193 V Tb, Te, Tc, Td;
Chris@42 194 Tb = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 195 Tc = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 196 Td = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 197 Te = VADD(Tc, Td);
Chris@42 198 To = VSUB(Td, Tc);
Chris@42 199 Tf = VADD(Tb, Te);
Chris@42 200 Tn = VFNMS(LDK(KP500000000), Te, Tb);
Chris@42 201 Tp = VFMA(LDK(KP173648177), Tn, VMUL(LDK(KP852868531), To));
Chris@42 202 Tu = VFNMS(LDK(KP984807753), Tn, VMUL(LDK(KP150383733), To));
Chris@42 203 }
/* Group of inputs 1, 4, 7: produces Tl, Ta, Tk, Tm and Tt. */
Chris@42 204 {
Chris@42 205 V T6, T9, T7, T8;
Chris@42 206 T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 207 T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 208 T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 209 T9 = VADD(T7, T8);
Chris@42 210 Tl = VSUB(T8, T7);
Chris@42 211 Ta = VADD(T6, T9);
Chris@42 212 Tk = VFNMS(LDK(KP500000000), T9, T6);
Chris@42 213 Tm = VFMA(LDK(KP766044443), Tk, VMUL(LDK(KP556670399), Tl));
Chris@42 214 Tt = VFNMS(LDK(KP642787609), Tk, VMUL(LDK(KP663413948), Tl));
Chris@42 215 }
Chris@42 216 {
Chris@42 217 V Ti, Tg, Th, Tz, TA;
/* Outputs 0, 3 and 6 depend only on the three group sums T5, Ta and Tf.
 * NOTE(review): VBYI presumably multiplies by i -- confirm in the SIMD
 * header. */
Chris@42 218 Ti = VBYI(VMUL(LDK(KP866025403), VSUB(Tf, Ta)));
Chris@42 219 Tg = VADD(Ta, Tf);
Chris@42 220 Th = VFNMS(LDK(KP500000000), Tg, T5);
Chris@42 221 ST(&(xo[0]), VADD(T5, Tg), ovs, &(xo[0]));
Chris@42 222 ST(&(xo[WS(os, 3)]), VADD(Th, Ti), ovs, &(xo[WS(os, 1)]));
Chris@42 223 ST(&(xo[WS(os, 6)]), VSUB(Th, Ti), ovs, &(xo[0]));
/* Outputs 7 and 2 are VSUB/VADD of the same pair (Tz, TA). */
Chris@42 224 Tz = VFMA(LDK(KP173648177), Tk, VFNMS(LDK(KP296198132), To, VFNMS(LDK(KP939692620), Tn, VFNMS(LDK(KP852868531), Tl, Tj))));
Chris@42 225 TA = VBYI(VSUB(VFNMS(LDK(KP342020143), Tn, VFNMS(LDK(KP150383733), Tl, VFNMS(LDK(KP984807753), Tk, VMUL(LDK(KP813797681), To)))), Ts));
Chris@42 226 ST(&(xo[WS(os, 7)]), VSUB(Tz, TA), ovs, &(xo[WS(os, 1)]));
Chris@42 227 ST(&(xo[WS(os, 2)]), VADD(Tz, TA), ovs, &(xo[0]));
Chris@42 228 {
Chris@42 229 V Tr, Tx, Tw, Ty, Tq, Tv;
/* Outputs 8/1 come from the pair (Tr, Tw) and outputs 4/5 from (Tx, Ty). */
Chris@42 230 Tq = VADD(Tm, Tp);
Chris@42 231 Tr = VADD(Tj, Tq);
Chris@42 232 Tx = VFMA(LDK(KP866025403), VSUB(Tt, Tu), VFNMS(LDK(KP500000000), Tq, Tj));
Chris@42 233 Tv = VADD(Tt, Tu);
Chris@42 234 Tw = VBYI(VADD(Ts, Tv));
Chris@42 235 Ty = VBYI(VADD(Ts, VFNMS(LDK(KP500000000), Tv, VMUL(LDK(KP866025403), VSUB(Tp, Tm)))));
Chris@42 236 ST(&(xo[WS(os, 8)]), VSUB(Tr, Tw), ovs, &(xo[0]));
Chris@42 237 ST(&(xo[WS(os, 4)]), VADD(Tx, Ty), ovs, &(xo[0]));
Chris@42 238 ST(&(xo[WS(os, 1)]), VADD(Tw, Tr), ovs, &(xo[WS(os, 1)]));
Chris@42 239 ST(&(xo[WS(os, 5)]), VSUB(Tx, Ty), ovs, &(xo[WS(os, 1)]));
Chris@42 240 }
Chris@42 241 }
Chris@42 242 }
Chris@42 243 }
/* Invoked once after the loop; VLEAVE is defined outside this file. */
Chris@42 244 VLEAVE();
Chris@42 245 }
Chris@42 246
/*
 * Descriptor passed to X(kdft_register) for the non-HAVE_FMA build of
 * n1fv_9. The leading 9 matches "-n 9" on the generator command line, the
 * string is the codelet name, and {30, 10, 16, 0} repeats the "30 additions,
 * 10 multiplications, 16 fused multiply/add" counts from the generated
 * comment above. The meaning of the fourth count, &GENUS and the four
 * trailing zeros is defined by kdft_desc, which is not visible here.
 */
Chris@42 247 static const kdft_desc desc = { 9, XSIMD_STRING("n1fv_9"), {30, 10, 16, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 248
/*
 * Registration entry point (non-HAVE_FMA build): hands the n1fv_9 kernel and
 * its descriptor to the planner p through X(kdft_register).
 */
Chris@42 249 void XSIMD(codelet_n1fv_9) (planner *p) {
Chris@42 250 X(kdft_register) (p, n1fv_9, &desc);
Chris@42 251 }
Chris@42 252
Chris@42 253 #endif /* HAVE_FMA */