annotate src/fftw-3.3.5/dft/simd/common/n1fv_14.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:38:41 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 14 -name n1fv_14 -include n1f.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 74 FP additions, 48 FP multiplications,
Chris@42 32 * (or, 32 additions, 6 multiplications, 42 fused multiply/add),
Chris@42 33 * 63 stack variables, 6 constants, and 28 memory accesses
Chris@42 34 */
Chris@42 35 #include "n1f.h"
Chris@42 36
/*
 * n1fv_14 (variant compiled when HAVE_FMA is defined): SIMD codelet emitted
 * by genfft for transform size 14 (see the generator command line above and
 * the size field of desc below).
 *
 * Each loop iteration loads the 14 inputs xi[WS(is, 0..13)] and stores the
 * 14 outputs xo[WS(os, 0..13)].
 *
 * ri, ro   - input and output base pointers; the only data pointers used.
 * ii, io   - not referenced in this body (presumably kept so that all
 *            codelets share one signature; confirm against codelet-dft.h).
 * is, os   - strides handed to WS() to address input / output element k.
 * v        - loop count: the loop runs while i > 0 and decrements i by VL.
 * ivs, ovs - xi advances by VL * ivs and xo by VL * ovs per iteration; the
 *            same values are also forwarded to LD / ST.
 *
 * NOTE(review): KP900968867 and KP974927912 numerically equal cos(pi/7) and
 * sin(3*pi/7). The other four constants numerically match ratios of sines or
 * cosines of multiples of pi/7 (for example KP356895867 matches
 * cos(3*pi/7) / cos(2*pi/7)); confirm against genfft before relying on this.
 */
Chris@42 37 static void n1fv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
Chris@42 39 DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@42 40 DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
Chris@42 41 DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@42 42 DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
Chris@42 43 DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
Chris@42 44 DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
Chris@42 45 {
Chris@42 46 INT i;
Chris@42 47 const R *xi;
Chris@42 48 R *xo;
Chris@42 49 xi = ri;
Chris@42 50 xo = ro;
/* Main loop: i counts down from v in steps of VL while both pointers advance. */
Chris@42 51 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
Chris@42 52 V TH, T3, TP, Tn, Ta, Ts, TW, TK, TO, Tk, TM, Tg, TL, Td, T1;
Chris@42 53 V T2;
/*
 * Loads: even-indexed inputs pass &(xi[0]) as the third LD argument and
 * odd-indexed inputs pass &(xi[WS(is, 1)]).
 */
Chris@42 54 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 55 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 56 {
Chris@42 57 V Ti, TI, T6, TJ, T9, Tj, Te, Tf, Tb, Tc;
Chris@42 58 {
Chris@42 59 V T4, T5, T7, T8, Tl, Tm;
Chris@42 60 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 61 T5 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 62 T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 63 T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 64 Tl = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 65 Tm = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 66 Ti = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
/*
 * Inputs whose indices differ by 7 (mod 14) are combined pairwise into a
 * sum and a difference: (0,7) -> TH, T3; (2,9) -> TI, T6; (12,5) -> TJ, T9;
 * (8,1) -> TP, Tn.
 */
Chris@42 67 TH = VADD(T1, T2);
Chris@42 68 T3 = VSUB(T1, T2);
Chris@42 69 TI = VADD(T4, T5);
Chris@42 70 T6 = VSUB(T4, T5);
Chris@42 71 TJ = VADD(T7, T8);
Chris@42 72 T9 = VSUB(T7, T8);
Chris@42 73 TP = VADD(Tl, Tm);
Chris@42 74 Tn = VSUB(Tl, Tm);
Chris@42 75 Tj = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 76 Te = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 77 Tf = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 78 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 79 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 80 }
/*
 * Second-level combinations of the pair sums (TK, TW) and pair differences
 * (Ta, Ts), plus the remaining pairs (6,13) -> TO, Tk; (10,3) -> TM, Tg;
 * (4,11) -> TL, Td.
 */
Chris@42 81 Ta = VADD(T6, T9);
Chris@42 82 Ts = VSUB(T9, T6);
Chris@42 83 TW = VSUB(TJ, TI);
Chris@42 84 TK = VADD(TI, TJ);
Chris@42 85 TO = VADD(Ti, Tj);
Chris@42 86 Tk = VSUB(Ti, Tj);
Chris@42 87 TM = VADD(Te, Tf);
Chris@42 88 Tg = VSUB(Te, Tf);
Chris@42 89 TL = VADD(Tb, Tc);
Chris@42 90 Td = VSUB(Tb, Tc);
Chris@42 91 }
Chris@42 92 {
Chris@42 93 V T18, TB, T13, TY, TG, Tw, T11, Tr, T16, TT, Tz, TE, TU, TQ;
Chris@42 94 TU = VSUB(TO, TP);
Chris@42 95 TQ = VADD(TO, TP);
Chris@42 96 {
Chris@42 97 V Tt, To, TV, TN;
Chris@42 98 Tt = VSUB(Tn, Tk);
Chris@42 99 To = VADD(Tk, Tn);
Chris@42 100 TV = VSUB(TL, TM);
Chris@42 101 TN = VADD(TL, TM);
Chris@42 102 {
Chris@42 103 V Tu, Th, TZ, T17;
Chris@42 104 Tu = VSUB(Tg, Td);
Chris@42 105 Th = VADD(Td, Tg);
Chris@42 106 TZ = VFNMS(LDK(KP356895867), TK, TQ);
Chris@42 107 T17 = VFNMS(LDK(KP554958132), TU, TW);
Chris@42 108 {
Chris@42 109 V Tp, TA, T14, TR;
Chris@42 110 Tp = VFNMS(LDK(KP356895867), Ta, To);
Chris@42 111 TA = VFMA(LDK(KP554958132), Tt, Ts);
/* Output 0 is the sum of all seven pair sums, i.e. of all 14 inputs. */
Chris@42 112 ST(&(xo[0]), VADD(TH, VADD(TK, VADD(TN, TQ))), ovs, &(xo[0]));
Chris@42 113 T14 = VFNMS(LDK(KP356895867), TN, TK);
Chris@42 114 TR = VFNMS(LDK(KP356895867), TQ, TN);
Chris@42 115 {
Chris@42 116 V T12, TX, Tx, TC;
Chris@42 117 T12 = VFMA(LDK(KP554958132), TV, TU);
Chris@42 118 TX = VFMA(LDK(KP554958132), TW, TV);
/* Output 7 is the sum of all seven pair differences. */
Chris@42 119 ST(&(xo[WS(os, 7)]), VADD(T3, VADD(Ta, VADD(Th, To))), ovs, &(xo[WS(os, 1)]));
Chris@42 120 Tx = VFNMS(LDK(KP356895867), Th, Ta);
Chris@42 121 TC = VFNMS(LDK(KP356895867), To, Th);
Chris@42 122 {
Chris@42 123 V TF, Tv, T10, Tq;
Chris@42 124 TF = VFNMS(LDK(KP554958132), Ts, Tu);
Chris@42 125 Tv = VFMA(LDK(KP554958132), Tu, Tt);
Chris@42 126 T10 = VFNMS(LDK(KP692021471), TZ, TN);
Chris@42 127 T18 = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), T17, TV));
Chris@42 128 Tq = VFNMS(LDK(KP692021471), Tp, Th);
Chris@42 129 TB = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), TA, Tu));
Chris@42 130 {
Chris@42 131 V T15, TS, Ty, TD;
Chris@42 132 T15 = VFNMS(LDK(KP692021471), T14, TQ);
Chris@42 133 TS = VFNMS(LDK(KP692021471), TR, TK);
Chris@42 134 T13 = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), T12, TW));
Chris@42 135 TY = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), TX, TU));
Chris@42 136 Ty = VFNMS(LDK(KP692021471), Tx, To);
Chris@42 137 TD = VFNMS(LDK(KP692021471), TC, Ta);
Chris@42 138 TG = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), TF, Tt));
Chris@42 139 Tw = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tv, Ts));
Chris@42 140 T11 = VFNMS(LDK(KP900968867), T10, TH);
Chris@42 141 Tr = VFNMS(LDK(KP900968867), Tq, T3);
Chris@42 142 T16 = VFNMS(LDK(KP900968867), T15, TH);
Chris@42 143 TT = VFNMS(LDK(KP900968867), TS, TH);
Chris@42 144 Tz = VFNMS(LDK(KP900968867), Ty, T3);
Chris@42 145 TE = VFNMS(LDK(KP900968867), TD, T3);
Chris@42 146 }
Chris@42 147 }
Chris@42 148 }
Chris@42 149 }
Chris@42 150 }
Chris@42 151 }
/*
 * The remaining twelve outputs are stored as pairs k and 14 - k, produced by
 * VFMAI and VFNMSI applied to the same two operands. Even-indexed outputs
 * are built only from the pair sums, odd-indexed ones only from the pair
 * differences.
 */
Chris@42 152 ST(&(xo[WS(os, 12)]), VFNMSI(T13, T11), ovs, &(xo[0]));
Chris@42 153 ST(&(xo[WS(os, 2)]), VFMAI(T13, T11), ovs, &(xo[0]));
Chris@42 154 ST(&(xo[WS(os, 9)]), VFMAI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
Chris@42 155 ST(&(xo[WS(os, 5)]), VFNMSI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
Chris@42 156 ST(&(xo[WS(os, 8)]), VFNMSI(T18, T16), ovs, &(xo[0]));
Chris@42 157 ST(&(xo[WS(os, 6)]), VFMAI(T18, T16), ovs, &(xo[0]));
Chris@42 158 ST(&(xo[WS(os, 10)]), VFNMSI(TY, TT), ovs, &(xo[0]));
Chris@42 159 ST(&(xo[WS(os, 4)]), VFMAI(TY, TT), ovs, &(xo[0]));
Chris@42 160 ST(&(xo[WS(os, 1)]), VFMAI(TB, Tz), ovs, &(xo[WS(os, 1)]));
Chris@42 161 ST(&(xo[WS(os, 13)]), VFNMSI(TB, Tz), ovs, &(xo[WS(os, 1)]));
Chris@42 162 ST(&(xo[WS(os, 3)]), VFMAI(TG, TE), ovs, &(xo[WS(os, 1)]));
Chris@42 163 ST(&(xo[WS(os, 11)]), VFNMSI(TG, TE), ovs, &(xo[WS(os, 1)]));
Chris@42 164 }
Chris@42 165 }
Chris@42 166 }
Chris@42 167 VLEAVE();
Chris@42 168 }
Chris@42 169
/*
 * Descriptor for the HAVE_FMA build of n1fv_14: size 14, the codelet name,
 * and the operation counts {32, 6, 42, 0}, which match the 32 additions,
 * 6 multiplications and 42 fused multiply/adds quoted in the generator
 * comment above. The meaning of the fourth count, of GENUS and of the
 * trailing zeros is set by kdft_desc, which is declared outside this file.
 */
Chris@42 170 static const kdft_desc desc = { 14, XSIMD_STRING("n1fv_14"), {32, 6, 42, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 171
/*
 * Registration entry point (HAVE_FMA build): hands the n1fv_14 kernel and
 * its descriptor desc to kdft_register for the planner p.
 */
Chris@42 172 void XSIMD(codelet_n1fv_14) (planner *p) {
Chris@42 173 X(kdft_register) (p, n1fv_14, &desc);
Chris@42 174 }
Chris@42 175
Chris@42 176 #else /* HAVE_FMA */
Chris@42 177
Chris@42 178 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 14 -name n1fv_14 -include n1f.h */
Chris@42 179
Chris@42 180 /*
Chris@42 181 * This function contains 74 FP additions, 36 FP multiplications,
Chris@42 182 * (or, 50 additions, 12 multiplications, 24 fused multiply/add),
Chris@42 183 * 33 stack variables, 6 constants, and 28 memory accesses
Chris@42 184 */
Chris@42 185 #include "n1f.h"
Chris@42 186
/*
 * n1fv_14 (variant compiled when HAVE_FMA is not defined): SIMD codelet
 * emitted by genfft for transform size 14 (see the generator command line
 * above and the size field of desc below).
 *
 * Each loop iteration loads the 14 inputs xi[WS(is, 0..13)] and stores the
 * 14 outputs xo[WS(os, 0..13)].
 *
 * ri, ro   - input and output base pointers; the only data pointers used.
 * ii, io   - not referenced in this body (presumably kept so that all
 *            codelets share one signature; confirm against codelet-dft.h).
 * is, os   - strides handed to WS() to address input / output element k.
 * v        - loop count: the loop runs while i > 0 and decrements i by VL.
 * ivs, ovs - xi advances by VL * ivs and xo by VL * ovs per iteration; the
 *            same values are also forwarded to LD / ST.
 *
 * NOTE(review): numerically, KP900968867, KP623489801 and KP222520933 equal
 * cos(pi/7), cos(2*pi/7) and cos(3*pi/7), while KP433883739, KP781831482 and
 * KP974927912 equal sin(pi/7), sin(2*pi/7) and sin(3*pi/7).
 */
Chris@42 187 static void n1fv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 188 {
Chris@42 189 DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
Chris@42 190 DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@42 191 DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
Chris@42 192 DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
Chris@42 193 DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
Chris@42 194 DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@42 195 {
Chris@42 196 INT i;
Chris@42 197 const R *xi;
Chris@42 198 R *xo;
Chris@42 199 xi = ri;
Chris@42 200 xo = ro;
/* Main loop: i counts down from v in steps of VL while both pointers advance. */
Chris@42 201 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
Chris@42 202 V T3, Ty, To, TK, Tr, TE, Ta, TJ, Tq, TB, Th, TL, Ts, TH, T1;
Chris@42 203 V T2;
/*
 * Inputs whose indices differ by 7 (mod 14) are combined pairwise into a
 * difference and a sum. Pair (0,7) -> T3 (difference), Ty (sum).
 */
Chris@42 204 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 205 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 206 T3 = VSUB(T1, T2);
Chris@42 207 Ty = VADD(T1, T2);
/* Pairs (6,13) and (8,1): differences Tk, Tn feed To, Tr; sums TC, TD feed TK, TE. */
Chris@42 208 {
Chris@42 209 V Tk, TC, Tn, TD;
Chris@42 210 {
Chris@42 211 V Ti, Tj, Tl, Tm;
Chris@42 212 Ti = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 213 Tj = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 214 Tk = VSUB(Ti, Tj);
Chris@42 215 TC = VADD(Ti, Tj);
Chris@42 216 Tl = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 217 Tm = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 218 Tn = VSUB(Tl, Tm);
Chris@42 219 TD = VADD(Tl, Tm);
Chris@42 220 }
Chris@42 221 To = VADD(Tk, Tn);
Chris@42 222 TK = VSUB(TC, TD);
Chris@42 223 Tr = VSUB(Tn, Tk);
Chris@42 224 TE = VADD(TC, TD);
Chris@42 225 }
/* Pairs (2,9) and (12,5): differences T6, T9 feed Ta, Tq; sums Tz, TA feed TJ, TB. */
Chris@42 226 {
Chris@42 227 V T6, Tz, T9, TA;
Chris@42 228 {
Chris@42 229 V T4, T5, T7, T8;
Chris@42 230 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 231 T5 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 232 T6 = VSUB(T4, T5);
Chris@42 233 Tz = VADD(T4, T5);
Chris@42 234 T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 235 T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 236 T9 = VSUB(T7, T8);
Chris@42 237 TA = VADD(T7, T8);
Chris@42 238 }
Chris@42 239 Ta = VADD(T6, T9);
Chris@42 240 TJ = VSUB(TA, Tz);
Chris@42 241 Tq = VSUB(T9, T6);
Chris@42 242 TB = VADD(Tz, TA);
Chris@42 243 }
/* Pairs (4,11) and (10,3): differences Td, Tg feed Th, Ts; sums TF, TG feed TL, TH. */
Chris@42 244 {
Chris@42 245 V Td, TF, Tg, TG;
Chris@42 246 {
Chris@42 247 V Tb, Tc, Te, Tf;
Chris@42 248 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 249 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 250 Td = VSUB(Tb, Tc);
Chris@42 251 TF = VADD(Tb, Tc);
Chris@42 252 Te = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 253 Tf = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 254 Tg = VSUB(Te, Tf);
Chris@42 255 TG = VADD(Te, Tf);
Chris@42 256 }
Chris@42 257 Th = VADD(Td, Tg);
Chris@42 258 TL = VSUB(TF, TG);
Chris@42 259 Ts = VSUB(Tg, Td);
Chris@42 260 TH = VADD(TF, TG);
Chris@42 261 }
/* Output 7 is the sum of all pair differences; output 0 is the sum of all pair sums. */
Chris@42 262 ST(&(xo[WS(os, 7)]), VADD(T3, VADD(Ta, VADD(Th, To))), ovs, &(xo[WS(os, 1)]));
Chris@42 263 ST(&(xo[0]), VADD(Ty, VADD(TB, VADD(TH, TE))), ovs, &(xo[0]));
/*
 * The remaining twelve outputs are stored as pairs k and 14 - k. Each pair
 * is the sum and the difference of one weighted combination and one
 * VBYI(...) term (VBYI is defined outside this file; presumably
 * multiplication by the imaginary unit - confirm). Odd-indexed outputs use
 * only the pair differences, even-indexed outputs only the pair sums.
 */
Chris@42 264 {
Chris@42 265 V Tt, Tp, TP, TQ;
/* Outputs 5 and 9, then 2 and 12. */
Chris@42 266 Tt = VBYI(VFNMS(LDK(KP781831482), Tr, VFNMS(LDK(KP433883739), Ts, VMUL(LDK(KP974927912), Tq))));
Chris@42 267 Tp = VFMA(LDK(KP623489801), To, VFNMS(LDK(KP900968867), Th, VFNMS(LDK(KP222520933), Ta, T3)));
Chris@42 268 ST(&(xo[WS(os, 5)]), VSUB(Tp, Tt), ovs, &(xo[WS(os, 1)]));
Chris@42 269 ST(&(xo[WS(os, 9)]), VADD(Tp, Tt), ovs, &(xo[WS(os, 1)]));
Chris@42 270 TP = VBYI(VFMA(LDK(KP974927912), TJ, VFMA(LDK(KP433883739), TL, VMUL(LDK(KP781831482), TK))));
Chris@42 271 TQ = VFMA(LDK(KP623489801), TE, VFNMS(LDK(KP900968867), TH, VFNMS(LDK(KP222520933), TB, Ty)));
Chris@42 272 ST(&(xo[WS(os, 2)]), VADD(TP, TQ), ovs, &(xo[0]));
Chris@42 273 ST(&(xo[WS(os, 12)]), VSUB(TQ, TP), ovs, &(xo[0]));
Chris@42 274 }
Chris@42 275 {
Chris@42 276 V Tv, Tu, TM, TI;
/* Outputs 13 and 1, then 6 and 8. */
Chris@42 277 Tv = VBYI(VFMA(LDK(KP781831482), Tq, VFMA(LDK(KP974927912), Ts, VMUL(LDK(KP433883739), Tr))));
Chris@42 278 Tu = VFMA(LDK(KP623489801), Ta, VFNMS(LDK(KP900968867), To, VFNMS(LDK(KP222520933), Th, T3)));
Chris@42 279 ST(&(xo[WS(os, 13)]), VSUB(Tu, Tv), ovs, &(xo[WS(os, 1)]));
Chris@42 280 ST(&(xo[WS(os, 1)]), VADD(Tu, Tv), ovs, &(xo[WS(os, 1)]));
Chris@42 281 TM = VBYI(VFNMS(LDK(KP433883739), TK, VFNMS(LDK(KP974927912), TL, VMUL(LDK(KP781831482), TJ))));
Chris@42 282 TI = VFMA(LDK(KP623489801), TB, VFNMS(LDK(KP900968867), TE, VFNMS(LDK(KP222520933), TH, Ty)));
Chris@42 283 ST(&(xo[WS(os, 6)]), VSUB(TI, TM), ovs, &(xo[0]));
Chris@42 284 ST(&(xo[WS(os, 8)]), VADD(TM, TI), ovs, &(xo[0]));
Chris@42 285 }
Chris@42 286 {
Chris@42 287 V TO, TN, Tx, Tw;
/* Outputs 4 and 10, then 11 and 3. */
Chris@42 288 TO = VBYI(VFMA(LDK(KP433883739), TJ, VFNMS(LDK(KP974927912), TK, VMUL(LDK(KP781831482), TL))));
Chris@42 289 TN = VFMA(LDK(KP623489801), TH, VFNMS(LDK(KP222520933), TE, VFNMS(LDK(KP900968867), TB, Ty)));
Chris@42 290 ST(&(xo[WS(os, 4)]), VSUB(TN, TO), ovs, &(xo[0]));
Chris@42 291 ST(&(xo[WS(os, 10)]), VADD(TO, TN), ovs, &(xo[0]));
Chris@42 292 Tx = VBYI(VFMA(LDK(KP433883739), Tq, VFNMS(LDK(KP781831482), Ts, VMUL(LDK(KP974927912), Tr))));
Chris@42 293 Tw = VFMA(LDK(KP623489801), Th, VFNMS(LDK(KP222520933), To, VFNMS(LDK(KP900968867), Ta, T3)));
Chris@42 294 ST(&(xo[WS(os, 11)]), VSUB(Tw, Tx), ovs, &(xo[WS(os, 1)]));
Chris@42 295 ST(&(xo[WS(os, 3)]), VADD(Tw, Tx), ovs, &(xo[WS(os, 1)]));
Chris@42 296 }
Chris@42 297 }
Chris@42 298 }
Chris@42 299 VLEAVE();
Chris@42 300 }
Chris@42 301
/*
 * Descriptor for the non-FMA build of n1fv_14: size 14, the codelet name,
 * and the operation counts {50, 12, 24, 0}, which match the 50 additions,
 * 12 multiplications and 24 fused multiply/adds quoted in the generator
 * comment above. The meaning of the fourth count, of GENUS and of the
 * trailing zeros is set by kdft_desc, which is declared outside this file.
 */
Chris@42 302 static const kdft_desc desc = { 14, XSIMD_STRING("n1fv_14"), {50, 12, 24, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 303
/*
 * Registration entry point (non-FMA build): hands the n1fv_14 kernel and
 * its descriptor desc to kdft_register for the planner p.
 */
Chris@42 304 void XSIMD(codelet_n1fv_14) (planner *p) {
Chris@42 305 X(kdft_register) (p, n1fv_14, &desc);
Chris@42 306 }
Chris@42 307
Chris@42 308 #endif /* HAVE_FMA */