annotate src/fftw-3.3.8/dft/simd/common/n1fv_9.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:51 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name n1fv_9 -include dft/simd/n1f.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 46 FP additions, 38 FP multiplications,
Chris@82 32 * (or, 12 additions, 4 multiplications, 34 fused multiply/add),
Chris@82 33 * 50 stack variables, 19 constants, and 18 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n1f.h"
Chris@82 36
/*
 * n1fv_9, FMA variant (the branch compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Generated SIMD codelet of size 9: each loop pass loads the nine inputs
 * xi[WS(is, 0..8)] and stores the nine outputs xo[WS(os, 0..8)].  In the
 * comments below, xk is shorthand for the value loaded from xi[WS(is, k)].
 *
 * Parameters as used by this body:
 *   ri, ro   - input and output base pointers (copied into xi / xo).
 *   ii, io   - never referenced in this body.
 *   is, os   - strides passed to WS() to address element k of input / output.
 *   v        - loop count; the loop starts at i = v, subtracts VL per pass
 *              and runs while i > 0.
 *   ivs, ovs - per-pass advance (xi += VL * ivs, xo += VL * ovs); also passed
 *              as the second argument of every LD / ST.
 *
 * NOTE(review): the formulas in the comments below assume VFMA(a, b, c) is
 * a*b + c and VFNMS(a, b, c) is c - a*b, and that VFMAI / VFNMSI combine
 * their arguments as (second +/- i * first).  Those macros are defined
 * outside this file -- confirm against the headers pulled in by
 * dft/simd/n1f.h.
 */
Chris@82 37 static void n1fv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
/* Vector constants.  KP500000000 is 1/2, KP666666666 is 2/3, and
 * KP866025403 agrees with sqrt(3)/2 to the digits shown. */
Chris@82 39 DVK(KP666666666, +0.666666666666666666666666666666666666666666667);
Chris@82 40 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@82 41 DVK(KP673648177, +0.673648177666930348851716626769314796000375677);
Chris@82 42 DVK(KP898197570, +0.898197570222573798468955502359086394667167570);
Chris@82 43 DVK(KP879385241, +0.879385241571816768108218554649462939872416269);
Chris@82 44 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@82 45 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@82 46 DVK(KP826351822, +0.826351822333069651148283373230685203999624323);
Chris@82 47 DVK(KP420276625, +0.420276625461206169731530603237061658838781920);
Chris@82 48 DVK(KP907603734, +0.907603734547952313649323976213898122064543220);
Chris@82 49 DVK(KP347296355, +0.347296355333860697703433253538629592000751354);
Chris@82 50 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 51 DVK(KP726681596, +0.726681596905677465811651808188092531873167623);
Chris@82 52 DVK(KP968908795, +0.968908795874236621082202410917456709164223497);
Chris@82 53 DVK(KP586256827, +0.586256827714544512072145703099641959914944179);
Chris@82 54 DVK(KP203604859, +0.203604859554852403062088995281827210665664861);
Chris@82 55 DVK(KP152703644, +0.152703644666139302296566746461370407999248646);
Chris@82 56 DVK(KP439692620, +0.439692620785908384054109277324731469936208134);
Chris@82 57 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 58 {
Chris@82 59 INT i;
Chris@82 60 const R *xi;
Chris@82 61 R *xo;
Chris@82 62 xi = ri;
Chris@82 63 xo = ro;
/* One pass per VL units of v.  Throughout the body the third LD / ST argument
 * is &x[0] for even element indices and &x[WS(., 1)] for odd ones; what the
 * macros do with it is not visible in this file. */
Chris@82 64 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
Chris@82 65 V T5, Tv, Tj, Tl, Tm, Ta, Tf, Tk, Ts, TB, Tx, Tn, To, TC, Ty;
Chris@82 66 V Ti, Tg, Th;
/* Inputs 0, 3, 6: T5 = x0 + x3 + x6, Tv = x6 - x3,
 * Tj = x0 - (x3 + x6)/2 (VFNMS assumption, see NOTE above). */
Chris@82 67 {
Chris@82 68 V T1, T2, T3, T4;
Chris@82 69 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 70 T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 71 T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 72 T4 = VADD(T2, T3);
Chris@82 73 T5 = VADD(T1, T4);
Chris@82 74 Tv = VSUB(T3, T2);
Chris@82 75 Tj = VFNMS(LDK(KP500000000), T4, T1);
Chris@82 76 }
/* Inputs 1, 4, 7 (T6, T7, T8) and 2, 5, 8 (Tb, Tc, Td):
 *   Ta = x1 + x4 + x7,  Tl = x4 - x7,  Tn = x1 - (x4 + x7)/2
 *   Tf = x2 + x5 + x8,  Tm = x8 - x5,  Tk = x2 - (x5 + x8)/2
 * Ts, TB, Tx, To, TC, Ty are further combinations of Tk, Tl, Tm, Tn that the
 * output stages below consume. */
Chris@82 77 {
Chris@82 78 V T6, Tb, T9, Te;
Chris@82 79 T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 80 Tb = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 81 {
Chris@82 82 V T7, T8, Tc, Td;
Chris@82 83 T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 84 T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 85 T9 = VADD(T7, T8);
Chris@82 86 Tl = VSUB(T7, T8);
Chris@82 87 Tc = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 88 Td = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 89 Te = VADD(Tc, Td);
Chris@82 90 Tm = VSUB(Td, Tc);
Chris@82 91 }
Chris@82 92 Ta = VADD(T6, T9);
Chris@82 93 Tf = VADD(Tb, Te);
Chris@82 94 Tk = VFNMS(LDK(KP500000000), Te, Tb);
Chris@82 95 Ts = VFNMS(LDK(KP439692620), Tl, Tk);
Chris@82 96 TB = VFNMS(LDK(KP152703644), Tm, Tk);
Chris@82 97 Tx = VFMA(LDK(KP203604859), Tk, Tm);
Chris@82 98 Tn = VFNMS(LDK(KP500000000), T9, T6);
Chris@82 99 To = VFNMS(LDK(KP586256827), Tn, Tm);
Chris@82 100 TC = VFMA(LDK(KP968908795), Tn, Tl);
Chris@82 101 Ty = VFNMS(LDK(KP726681596), Tl, Tn);
Chris@82 102 }
/* Outputs 0, 3, 6.  Output 0 is T5 + Tg, i.e. the sum of all nine inputs.
 * Outputs 3 and 6 are built from the same pair (Ti, Th), using VFMAI and
 * VFNMSI respectively. */
Chris@82 103 Ti = VMUL(LDK(KP866025403), VSUB(Tf, Ta));
Chris@82 104 Tg = VADD(Ta, Tf);
Chris@82 105 Th = VFNMS(LDK(KP500000000), Tg, T5);
Chris@82 106 ST(&(xo[0]), VADD(T5, Tg), ovs, &(xo[0]));
Chris@82 107 ST(&(xo[WS(os, 3)]), VFMAI(Ti, Th), ovs, &(xo[WS(os, 1)]));
Chris@82 108 ST(&(xo[WS(os, 6)]), VFNMSI(Ti, Th), ovs, &(xo[0]));
/* Outputs 2 and 7 share the pair (Tw, Tr): output 2 uses VFNMSI,
 * output 7 uses VFMAI. */
Chris@82 109 {
Chris@82 110 V Tq, Tu, Tp, Tt, Tr, Tw;
Chris@82 111 Tp = VFNMS(LDK(KP347296355), To, Tl);
Chris@82 112 Tq = VFNMS(LDK(KP907603734), Tp, Tk);
Chris@82 113 Tt = VFNMS(LDK(KP420276625), Ts, Tm);
Chris@82 114 Tu = VFNMS(LDK(KP826351822), Tt, Tn);
Chris@82 115 Tr = VFNMS(LDK(KP939692620), Tq, Tj);
Chris@82 116 Tw = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tv, Tu));
Chris@82 117 ST(&(xo[WS(os, 2)]), VFNMSI(Tw, Tr), ovs, &(xo[0]));
Chris@82 118 ST(&(xo[WS(os, 7)]), VFMAI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
Chris@82 119 }
/* Outputs 1 and 8 share the pair (TE, TA); outputs 5 and 4 share the pair
 * (TK, TH).  In each pair the first index uses VFNMSI, the second VFMAI. */
Chris@82 120 {
Chris@82 121 V TA, TG, TE, TJ, TH, TK;
Chris@82 122 {
Chris@82 123 V Tz, TF, TD, TI;
Chris@82 124 Tz = VFMA(LDK(KP898197570), Ty, Tx);
Chris@82 125 TF = VFNMS(LDK(KP673648177), TC, TB);
Chris@82 126 TA = VFMA(LDK(KP852868531), Tz, Tj);
Chris@82 127 TG = VFNMS(LDK(KP500000000), Tz, TF);
Chris@82 128 TD = VFMA(LDK(KP673648177), TC, TB);
Chris@82 129 TI = VFNMS(LDK(KP898197570), Ty, Tx);
Chris@82 130 TE = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tv, TD));
Chris@82 131 TJ = VFMA(LDK(KP666666666), TD, TI);
Chris@82 132 }
Chris@82 133 ST(&(xo[WS(os, 1)]), VFNMSI(TE, TA), ovs, &(xo[WS(os, 1)]));
Chris@82 134 ST(&(xo[WS(os, 8)]), VFMAI(TE, TA), ovs, &(xo[0]));
Chris@82 135 TH = VFMA(LDK(KP852868531), TG, Tj);
Chris@82 136 TK = VMUL(LDK(KP866025403), VFMA(LDK(KP852868531), TJ, Tv));
Chris@82 137 ST(&(xo[WS(os, 5)]), VFNMSI(TK, TH), ovs, &(xo[WS(os, 1)]));
Chris@82 138 ST(&(xo[WS(os, 4)]), VFMAI(TK, TH), ovs, &(xo[0]));
Chris@82 139 }
Chris@82 140 }
Chris@82 141 }
Chris@82 142 VLEAVE();
Chris@82 143 }
Chris@82 144
/*
 * Descriptor passed to kdft_register for the FMA variant: the value 9
 * (matching the "-n 9" generator option), the name string "n1fv_9", and the
 * counts {12, 4, 34, 0}, whose first three entries match the
 * "12 additions, 4 multiplications, 34 fused multiply/add" figures in this
 * variant's generated header comment.
 * NOTE(review): the meaning of the fourth count and of the four trailing
 * zeros is fixed by the kdft_desc declaration, which is outside this file --
 * confirm there.
 */
Chris@82 145 static const kdft_desc desc = { 9, XSIMD_STRING("n1fv_9"), {12, 4, 34, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 146
/*
 * Registration entry point (FMA variant): hands the n1fv_9 function and its
 * descriptor `desc` to the planner `p` through kdft_register.  The exported
 * symbol name is whatever XSIMD() expands codelet_n1fv_9 to.
 */
Chris@82 147 void XSIMD(codelet_n1fv_9) (planner *p) {
Chris@82 148 X(kdft_register) (p, n1fv_9, &desc);
Chris@82 149 }
Chris@82 150
Chris@82 151 #else
Chris@82 152
Chris@82 153 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 9 -name n1fv_9 -include dft/simd/n1f.h */
Chris@82 154
Chris@82 155 /*
Chris@82 156 * This function contains 46 FP additions, 26 FP multiplications,
Chris@82 157 * (or, 30 additions, 10 multiplications, 16 fused multiply/add),
Chris@82 158 * 41 stack variables, 14 constants, and 18 memory accesses
Chris@82 159 */
Chris@82 160 #include "dft/simd/n1f.h"
Chris@82 161
/*
 * n1fv_9, non-FMA variant (the #else branch, used when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Generated SIMD codelet of size 9: each loop pass loads the nine inputs
 * xi[WS(is, 0..8)] and stores the nine outputs xo[WS(os, 0..8)].  In the
 * comments below, xk is shorthand for the value loaded from xi[WS(is, k)].
 *
 * Parameters as used by this body:
 *   ri, ro   - input and output base pointers (copied into xi / xo).
 *   ii, io   - never referenced in this body.
 *   is, os   - strides passed to WS() to address element k of input / output.
 *   v        - loop count; the loop starts at i = v, subtracts VL per pass
 *              and runs while i > 0.
 *   ivs, ovs - per-pass advance (xi += VL * ivs, xo += VL * ovs); also passed
 *              as the second argument of every LD / ST.
 *
 * NOTE(review): the formulas in the comments below assume VFMA(a, b, c) is
 * a*b + c and VFNMS(a, b, c) is c - a*b, and that VBYI presumably multiplies
 * its argument by the imaginary unit.  Those macros are defined outside this
 * file -- confirm against the headers pulled in by dft/simd/n1f.h.
 */
Chris@82 162 static void n1fv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 163 {
/* Vector constants.  KP500000000 is 1/2 and KP866025403 agrees with
 * sqrt(3)/2 to the digits shown. */
Chris@82 164 DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
Chris@82 165 DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
Chris@82 166 DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@82 167 DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
Chris@82 168 DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
Chris@82 169 DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
Chris@82 170 DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
Chris@82 171 DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
Chris@82 172 DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@82 173 DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
Chris@82 174 DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@82 175 DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
Chris@82 176 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 177 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 178 {
Chris@82 179 INT i;
Chris@82 180 const R *xi;
Chris@82 181 R *xo;
Chris@82 182 xi = ri;
Chris@82 183 xo = ro;
/* One pass per VL units of v.  Throughout the body the third LD / ST argument
 * is &x[0] for even element indices and &x[WS(., 1)] for odd ones; what the
 * macros do with it is not visible in this file. */
Chris@82 184 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(18, is), MAKE_VOLATILE_STRIDE(18, os)) {
Chris@82 185 V T5, Ts, Tj, To, Tf, Tn, Tp, Tu, Tl, Ta, Tk, Tm, Tt;
/* Inputs 0, 3, 6: T5 = x0 + x3 + x6, Ts = KP866025403 * (x6 - x3),
 * Tj = x0 - (x3 + x6)/2 (VFNMS assumption, see NOTE above). */
Chris@82 186 {
Chris@82 187 V T1, T2, T3, T4;
Chris@82 188 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 189 T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 190 T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 191 T4 = VADD(T2, T3);
Chris@82 192 T5 = VADD(T1, T4);
Chris@82 193 Ts = VMUL(LDK(KP866025403), VSUB(T3, T2));
Chris@82 194 Tj = VFNMS(LDK(KP500000000), T4, T1);
Chris@82 195 }
/* Inputs 2, 5, 8: Tf = x2 + x5 + x8, To = x8 - x5, Tn = x2 - (x5 + x8)/2.
 * Tp and Tu are two weighted combinations of Tn and To used for
 * outputs 1, 4, 5, 8. */
Chris@82 196 {
Chris@82 197 V Tb, Te, Tc, Td;
Chris@82 198 Tb = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 199 Tc = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 200 Td = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 201 Te = VADD(Tc, Td);
Chris@82 202 To = VSUB(Td, Tc);
Chris@82 203 Tf = VADD(Tb, Te);
Chris@82 204 Tn = VFNMS(LDK(KP500000000), Te, Tb);
Chris@82 205 Tp = VFMA(LDK(KP173648177), Tn, VMUL(LDK(KP852868531), To));
Chris@82 206 Tu = VFNMS(LDK(KP984807753), Tn, VMUL(LDK(KP150383733), To));
Chris@82 207 }
/* Inputs 1, 4, 7: Ta = x1 + x4 + x7, Tl = x7 - x4, Tk = x1 - (x4 + x7)/2.
 * Tm and Tt are two weighted combinations of Tk and Tl used for
 * outputs 1, 4, 5, 8. */
Chris@82 208 {
Chris@82 209 V T6, T9, T7, T8;
Chris@82 210 T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 211 T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 212 T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 213 T9 = VADD(T7, T8);
Chris@82 214 Tl = VSUB(T8, T7);
Chris@82 215 Ta = VADD(T6, T9);
Chris@82 216 Tk = VFNMS(LDK(KP500000000), T9, T6);
Chris@82 217 Tm = VFMA(LDK(KP766044443), Tk, VMUL(LDK(KP556670399), Tl));
Chris@82 218 Tt = VFNMS(LDK(KP642787609), Tk, VMUL(LDK(KP663413948), Tl));
Chris@82 219 }
/* Output stage.  Output 0 is T5 + Tg, i.e. the sum of all nine inputs.
 * The remaining outputs are written as sum/difference pairs:
 *   3 = Th + Ti,  6 = Th - Ti
 *   2 = Tz + TA,  7 = Tz - TA
 *   1 = Tr + Tw,  8 = Tr - Tw
 *   4 = Tx + Ty,  5 = Tx - Ty
 * where Ti, TA, Tw, Ty are the VBYI(...) terms. */
Chris@82 220 {
Chris@82 221 V Ti, Tg, Th, Tz, TA;
Chris@82 222 Ti = VBYI(VMUL(LDK(KP866025403), VSUB(Tf, Ta)));
Chris@82 223 Tg = VADD(Ta, Tf);
Chris@82 224 Th = VFNMS(LDK(KP500000000), Tg, T5);
Chris@82 225 ST(&(xo[0]), VADD(T5, Tg), ovs, &(xo[0]));
Chris@82 226 ST(&(xo[WS(os, 3)]), VADD(Th, Ti), ovs, &(xo[WS(os, 1)]));
Chris@82 227 ST(&(xo[WS(os, 6)]), VSUB(Th, Ti), ovs, &(xo[0]));
Chris@82 228 Tz = VFMA(LDK(KP173648177), Tk, VFNMS(LDK(KP296198132), To, VFNMS(LDK(KP939692620), Tn, VFNMS(LDK(KP852868531), Tl, Tj))));
Chris@82 229 TA = VBYI(VSUB(VFNMS(LDK(KP342020143), Tn, VFNMS(LDK(KP150383733), Tl, VFNMS(LDK(KP984807753), Tk, VMUL(LDK(KP813797681), To)))), Ts));
Chris@82 230 ST(&(xo[WS(os, 7)]), VSUB(Tz, TA), ovs, &(xo[WS(os, 1)]));
Chris@82 231 ST(&(xo[WS(os, 2)]), VADD(Tz, TA), ovs, &(xo[0]));
Chris@82 232 {
Chris@82 233 V Tr, Tx, Tw, Ty, Tq, Tv;
Chris@82 234 Tq = VADD(Tm, Tp);
Chris@82 235 Tr = VADD(Tj, Tq);
Chris@82 236 Tx = VFMA(LDK(KP866025403), VSUB(Tt, Tu), VFNMS(LDK(KP500000000), Tq, Tj));
Chris@82 237 Tv = VADD(Tt, Tu);
Chris@82 238 Tw = VBYI(VADD(Ts, Tv));
Chris@82 239 Ty = VBYI(VADD(Ts, VFNMS(LDK(KP500000000), Tv, VMUL(LDK(KP866025403), VSUB(Tp, Tm)))));
Chris@82 240 ST(&(xo[WS(os, 8)]), VSUB(Tr, Tw), ovs, &(xo[0]));
Chris@82 241 ST(&(xo[WS(os, 4)]), VADD(Tx, Ty), ovs, &(xo[0]));
Chris@82 242 ST(&(xo[WS(os, 1)]), VADD(Tw, Tr), ovs, &(xo[WS(os, 1)]));
Chris@82 243 ST(&(xo[WS(os, 5)]), VSUB(Tx, Ty), ovs, &(xo[WS(os, 1)]));
Chris@82 244 }
Chris@82 245 }
Chris@82 246 }
Chris@82 247 }
Chris@82 248 VLEAVE();
Chris@82 249 }
Chris@82 250
/*
 * Descriptor passed to kdft_register for the non-FMA variant: the value 9
 * (matching the "-n 9" generator option), the name string "n1fv_9", and the
 * counts {30, 10, 16, 0}, whose first three entries match the
 * "30 additions, 10 multiplications, 16 fused multiply/add" figures in this
 * variant's generated header comment.
 * NOTE(review): the meaning of the fourth count and of the four trailing
 * zeros is fixed by the kdft_desc declaration, which is outside this file --
 * confirm there.
 */
Chris@82 251 static const kdft_desc desc = { 9, XSIMD_STRING("n1fv_9"), {30, 10, 16, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 252
/*
 * Registration entry point (non-FMA variant): hands the n1fv_9 function and
 * its descriptor `desc` to the planner `p` through kdft_register.  The
 * exported symbol name is whatever XSIMD() expands codelet_n1fv_9 to.
 */
Chris@82 253 void XSIMD(codelet_n1fv_9) (planner *p) {
Chris@82 254 X(kdft_register) (p, n1fv_9, &desc);
Chris@82 255 }
Chris@82 256
Chris@82 257 #endif