annotate src/fftw-3.3.3/dft/simd/common/n1bv_14.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:37:02 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 14 -name n1bv_14 -include n1b.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 74 FP additions, 48 FP multiplications,
Chris@10 32 * (or, 32 additions, 6 multiplications, 42 fused multiply/add),
Chris@10 33 * 63 stack variables, 6 constants, and 28 memory accesses
Chris@10 34 */
Chris@10 35 #include "n1b.h"
Chris@10 36
/*
 * n1bv_14, HAVE_FMA build: size-14 SIMD codelet emitted by genfft's
 * gen_notw_c with "-fma -simd -sign 1 -n 14" (see the "Generated by" line
 * above).  Presumably this is the backward (sign +1) no-twiddle DFT kernel;
 * confirm against the generator documentation.
 *
 * Parameters, as used by this body:
 *   ri, ro    not referenced in this body.
 *   ii        input base pointer; element k is read at xi[WS(is, k)].
 *   io        output base pointer; element k is written at xo[WS(os, k)].
 *   is, os    strides handed to WS() and to MAKE_VOLATILE_STRIDE().
 *   v         loop count; reduced by VL on every pass, loop runs while > 0.
 *   ivs, ovs  xi / xo advance by VL * ivs / VL * ovs per pass; also passed
 *             as the second argument of LD / ST.
 *
 * Every pass loads 14 inputs (indices 0..13) and stores 14 outputs
 * (indices 0..13).  The third LD/ST argument is &x[0] for even indices and
 * &x[WS(stride, 1)] for odd indices; its meaning is defined by the SIMD
 * header, which is not visible here.
 */
Chris@10 37 static void n1bv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 38 {
/* Named constants declared with DVK and consumed below through LDK(). */
Chris@10 39 DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@10 40 DVK(KP801937735, +0.801937735804838252472204639014890102331838324);
Chris@10 41 DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@10 42 DVK(KP692021471, +0.692021471630095869627814897002069140197260599);
Chris@10 43 DVK(KP554958132, +0.554958132087371191422194871006410481067288862);
Chris@10 44 DVK(KP356895867, +0.356895867892209443894399510021300583399127187);
Chris@10 45 {
Chris@10 46 INT i;
Chris@10 47 const R *xi;
Chris@10 48 R *xo;
/* Only the ii / io pointers are used as the working input / output. */
Chris@10 49 xi = ii;
Chris@10 50 xo = io;
Chris@10 51 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
Chris@10 52 V TH, T3, TP, Tn, Ta, Tu, TU, TK, TO, Tk, TM, Tg, TL, Td, T1;
Chris@10 53 V T2;
Chris@10 54 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@10 55 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@10 56 {
Chris@10 57 V Ti, TI, T6, TJ, T9, Tj, Te, Tf, Tb, Tc;
Chris@10 58 {
Chris@10 59 V T4, T5, T7, T8, Tl, Tm;
/*
 * Inputs are combined in pairs whose indices differ by 7:
 * (0,7) (2,9) (12,5) (8,1) (6,13) (10,3) (4,11).  Each pair yields one
 * VADD sum and one VSUB difference.
 */
Chris@10 60 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@10 61 T5 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@10 62 T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@10 63 T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@10 64 Tl = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@10 65 Tm = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@10 66 Ti = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@10 67 TH = VADD(T1, T2);
Chris@10 68 T3 = VSUB(T1, T2);
Chris@10 69 TI = VADD(T4, T5);
Chris@10 70 T6 = VSUB(T4, T5);
Chris@10 71 TJ = VADD(T7, T8);
Chris@10 72 T9 = VSUB(T7, T8);
Chris@10 73 TP = VADD(Tl, Tm);
Chris@10 74 Tn = VSUB(Tl, Tm);
Chris@10 75 Tj = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@10 76 Te = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@10 77 Tf = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@10 78 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@10 79 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@10 80 }
/* Second stage: sums and differences of the pair results from above. */
Chris@10 81 Ta = VADD(T6, T9);
Chris@10 82 Tu = VSUB(T6, T9);
Chris@10 83 TU = VSUB(TI, TJ);
Chris@10 84 TK = VADD(TI, TJ);
Chris@10 85 TO = VADD(Ti, Tj);
Chris@10 86 Tk = VSUB(Ti, Tj);
Chris@10 87 TM = VADD(Te, Tf);
Chris@10 88 Tg = VSUB(Te, Tf);
Chris@10 89 TL = VADD(Tb, Tc);
Chris@10 90 Td = VSUB(Tb, Tc);
Chris@10 91 }
Chris@10 92 {
Chris@10 93 V T13, TG, TY, T18, TB, Tw, TT, Tz, T11, T16, TE, Tr, TV, TQ;
Chris@10 94 TV = VSUB(TP, TO);
Chris@10 95 TQ = VADD(TO, TP);
Chris@10 96 {
Chris@10 97 V Ts, To, TW, TN;
Chris@10 98 Ts = VSUB(Tk, Tn);
Chris@10 99 To = VADD(Tk, Tn);
Chris@10 100 TW = VSUB(TM, TL);
Chris@10 101 TN = VADD(TL, TM);
Chris@10 102 {
Chris@10 103 V Tt, Th, TR, T12;
Chris@10 104 Tt = VSUB(Td, Tg);
Chris@10 105 Th = VADD(Td, Tg);
Chris@10 106 TR = VFNMS(LDK(KP356895867), TK, TQ);
Chris@10 107 T12 = VFNMS(LDK(KP554958132), TV, TU);
Chris@10 108 {
Chris@10 109 V Tx, TF, TZ, T14;
Chris@10 110 Tx = VFNMS(LDK(KP356895867), Ta, To);
Chris@10 111 TF = VFMA(LDK(KP554958132), Ts, Tu);
/* Output 0 is the plain sum of TH and the three combined pair sums. */
Chris@10 112 ST(&(xo[0]), VADD(TH, VADD(TK, VADD(TN, TQ))), ovs, &(xo[0]));
Chris@10 113 TZ = VFNMS(LDK(KP356895867), TN, TK);
Chris@10 114 T14 = VFNMS(LDK(KP356895867), TQ, TN);
Chris@10 115 {
Chris@10 116 V TX, T17, TC, Tp;
Chris@10 117 TX = VFMA(LDK(KP554958132), TW, TV);
Chris@10 118 T17 = VFMA(LDK(KP554958132), TU, TW);
/* Output 7 is the plain sum of T3 and the three combined pair differences. */
Chris@10 119 ST(&(xo[WS(os, 7)]), VADD(T3, VADD(Ta, VADD(Th, To))), ovs, &(xo[WS(os, 1)]));
Chris@10 120 TC = VFNMS(LDK(KP356895867), Th, Ta);
Chris@10 121 Tp = VFNMS(LDK(KP356895867), To, Th);
Chris@10 122 {
Chris@10 123 V TA, Tv, TS, Ty;
Chris@10 124 TA = VFMA(LDK(KP554958132), Tt, Ts);
Chris@10 125 Tv = VFNMS(LDK(KP554958132), Tu, Tt);
Chris@10 126 TS = VFNMS(LDK(KP692021471), TR, TN);
Chris@10 127 T13 = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), T12, TW));
Chris@10 128 Ty = VFNMS(LDK(KP692021471), Tx, Th);
Chris@10 129 TG = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), TF, Tt));
Chris@10 130 {
Chris@10 131 V T10, T15, TD, Tq;
Chris@10 132 T10 = VFNMS(LDK(KP692021471), TZ, TQ);
Chris@10 133 T15 = VFNMS(LDK(KP692021471), T14, TK);
Chris@10 134 TY = VMUL(LDK(KP974927912), VFMA(LDK(KP801937735), TX, TU));
Chris@10 135 T18 = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), T17, TV));
Chris@10 136 TD = VFNMS(LDK(KP692021471), TC, To);
Chris@10 137 Tq = VFNMS(LDK(KP692021471), Tp, Ta);
Chris@10 138 TB = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), TA, Tu));
Chris@10 139 Tw = VMUL(LDK(KP974927912), VFNMS(LDK(KP801937735), Tv, Ts));
Chris@10 140 TT = VFNMS(LDK(KP900968867), TS, TH);
Chris@10 141 Tz = VFNMS(LDK(KP900968867), Ty, T3);
Chris@10 142 T11 = VFNMS(LDK(KP900968867), T10, TH);
Chris@10 143 T16 = VFNMS(LDK(KP900968867), T15, TH);
Chris@10 144 TE = VFNMS(LDK(KP900968867), TD, T3);
Chris@10 145 Tr = VFNMS(LDK(KP900968867), Tq, T3);
Chris@10 146 }
Chris@10 147 }
Chris@10 148 }
Chris@10 149 }
Chris@10 150 }
Chris@10 151 }
/*
 * The remaining 12 outputs are written as pairs (k, 14 - k); both members
 * of a pair use the same two operands, one via VFMAI and one via VFNMSI:
 * (2,12) (9,5) (6,8) (4,10) (1,13) (3,11).
 */
Chris@10 152 ST(&(xo[WS(os, 2)]), VFMAI(TY, TT), ovs, &(xo[0]));
Chris@10 153 ST(&(xo[WS(os, 12)]), VFNMSI(TY, TT), ovs, &(xo[0]));
Chris@10 154 ST(&(xo[WS(os, 9)]), VFMAI(TB, Tz), ovs, &(xo[WS(os, 1)]));
Chris@10 155 ST(&(xo[WS(os, 5)]), VFNMSI(TB, Tz), ovs, &(xo[WS(os, 1)]));
Chris@10 156 ST(&(xo[WS(os, 6)]), VFMAI(T13, T11), ovs, &(xo[0]));
Chris@10 157 ST(&(xo[WS(os, 8)]), VFNMSI(T13, T11), ovs, &(xo[0]));
Chris@10 158 ST(&(xo[WS(os, 4)]), VFMAI(T18, T16), ovs, &(xo[0]));
Chris@10 159 ST(&(xo[WS(os, 10)]), VFNMSI(T18, T16), ovs, &(xo[0]));
Chris@10 160 ST(&(xo[WS(os, 13)]), VFNMSI(TG, TE), ovs, &(xo[WS(os, 1)]));
Chris@10 161 ST(&(xo[WS(os, 1)]), VFMAI(TG, TE), ovs, &(xo[WS(os, 1)]));
Chris@10 162 ST(&(xo[WS(os, 11)]), VFNMSI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
Chris@10 163 ST(&(xo[WS(os, 3)]), VFMAI(Tw, Tr), ovs, &(xo[WS(os, 1)]));
Chris@10 164 }
Chris@10 165 }
Chris@10 166 }
/* Runs once after the loop; defined by the SIMD header (not visible here). */
Chris@10 167 VLEAVE();
Chris@10 168 }
Chris@10 169
/*
 * Descriptor for the HAVE_FMA build of n1bv_14.  The leading 14 matches the
 * generator's "-n 14", and {32, 6, 42, 0} matches the operation counts in
 * the header comment of this branch (32 additions, 6 multiplications,
 * 42 fused multiply/add).  The meaning of the fourth count and of the four
 * trailing zeros is defined by kdft_desc, which is declared elsewhere --
 * TODO confirm against its declaration.
 */
Chris@10 170 static const kdft_desc desc = { 14, XSIMD_STRING("n1bv_14"), {32, 6, 42, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 171
/*
 * Registration entry point: hands the n1bv_14 function and its descriptor
 * to the planner p through X(kdft_register).
 */
Chris@10 172 void XSIMD(codelet_n1bv_14) (planner *p) {
Chris@10 173 X(kdft_register) (p, n1bv_14, &desc);
Chris@10 174 }
Chris@10 175
Chris@10 176 #else /* HAVE_FMA */
Chris@10 177
Chris@10 178 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 14 -name n1bv_14 -include n1b.h */
Chris@10 179
Chris@10 180 /*
Chris@10 181 * This function contains 74 FP additions, 36 FP multiplications,
Chris@10 182 * (or, 50 additions, 12 multiplications, 24 fused multiply/add),
Chris@10 183 * 33 stack variables, 6 constants, and 28 memory accesses
Chris@10 184 */
Chris@10 185 #include "n1b.h"
Chris@10 186
/*
 * n1bv_14, build without HAVE_FMA: size-14 SIMD codelet emitted by genfft's
 * gen_notw_c with "-simd -sign 1 -n 14" (see the "Generated by" line above).
 * Presumably this is the backward (sign +1) no-twiddle DFT kernel; confirm
 * against the generator documentation.
 *
 * Parameters, as used by this body:
 *   ri, ro    not referenced in this body.
 *   ii        input base pointer; element k is read at xi[WS(is, k)].
 *   io        output base pointer; element k is written at xo[WS(os, k)].
 *   is, os    strides handed to WS() and to MAKE_VOLATILE_STRIDE().
 *   v         loop count; reduced by VL on every pass, loop runs while > 0.
 *   ivs, ovs  xi / xo advance by VL * ivs / VL * ovs per pass; also passed
 *             as the second argument of LD / ST.
 *
 * Every pass loads 14 inputs (indices 0..13) and stores 14 outputs
 * (indices 0..13).
 */
Chris@10 187 static void n1bv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 188 {
/* Named constants declared with DVK and consumed below through LDK(). */
Chris@10 189 DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@10 190 DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
Chris@10 191 DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
Chris@10 192 DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
Chris@10 193 DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@10 194 DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
Chris@10 195 {
Chris@10 196 INT i;
Chris@10 197 const R *xi;
Chris@10 198 R *xo;
/* Only the ii / io pointers are used as the working input / output. */
Chris@10 199 xi = ii;
Chris@10 200 xo = io;
Chris@10 201 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
Chris@10 202 V Tp, Ty, Tl, TL, Tq, TE, T7, TJ, Ts, TB, Te, TK, Tr, TH, Tn;
Chris@10 203 V To;
/* Inputs 0 and 7: difference in Tp, sum in Ty. */
Chris@10 204 Tn = LD(&(xi[0]), ivs, &(xi[0]));
Chris@10 205 To = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@10 206 Tp = VSUB(Tn, To);
Chris@10 207 Ty = VADD(Tn, To);
/*
 * The three blocks below share one shape: load two input pairs whose
 * indices differ by 7, form each pair's sum and difference, then combine
 * the two differences and the two sums by VADD / VSUB.
 */
Chris@10 208 {
Chris@10 209 V Th, TC, Tk, TD;
Chris@10 210 {
Chris@10 211 V Tf, Tg, Ti, Tj;
/* Pairs (4,11) and (10,3). */
Chris@10 212 Tf = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@10 213 Tg = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@10 214 Th = VSUB(Tf, Tg);
Chris@10 215 TC = VADD(Tf, Tg);
Chris@10 216 Ti = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@10 217 Tj = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@10 218 Tk = VSUB(Ti, Tj);
Chris@10 219 TD = VADD(Ti, Tj);
Chris@10 220 }
Chris@10 221 Tl = VSUB(Th, Tk);
Chris@10 222 TL = VSUB(TD, TC);
Chris@10 223 Tq = VADD(Th, Tk);
Chris@10 224 TE = VADD(TC, TD);
Chris@10 225 }
Chris@10 226 {
Chris@10 227 V T3, Tz, T6, TA;
Chris@10 228 {
Chris@10 229 V T1, T2, T4, T5;
/* Pairs (2,9) and (12,5). */
Chris@10 230 T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@10 231 T2 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@10 232 T3 = VSUB(T1, T2);
Chris@10 233 Tz = VADD(T1, T2);
Chris@10 234 T4 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@10 235 T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@10 236 T6 = VSUB(T4, T5);
Chris@10 237 TA = VADD(T4, T5);
Chris@10 238 }
Chris@10 239 T7 = VSUB(T3, T6);
Chris@10 240 TJ = VSUB(Tz, TA);
Chris@10 241 Ts = VADD(T3, T6);
Chris@10 242 TB = VADD(Tz, TA);
Chris@10 243 }
Chris@10 244 {
Chris@10 245 V Ta, TF, Td, TG;
Chris@10 246 {
Chris@10 247 V T8, T9, Tb, Tc;
/* Pairs (6,13) and (8,1). */
Chris@10 248 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@10 249 T9 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@10 250 Ta = VSUB(T8, T9);
Chris@10 251 TF = VADD(T8, T9);
Chris@10 252 Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@10 253 Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@10 254 Td = VSUB(Tb, Tc);
Chris@10 255 TG = VADD(Tb, Tc);
Chris@10 256 }
Chris@10 257 Te = VSUB(Ta, Td);
Chris@10 258 TK = VSUB(TG, TF);
Chris@10 259 Tr = VADD(Ta, Td);
Chris@10 260 TH = VADD(TF, TG);
Chris@10 261 }
/*
 * Output 7 is the plain sum of the difference terms; output 0 is the plain
 * sum of the sum terms.
 */
Chris@10 262 ST(&(xo[WS(os, 7)]), VADD(Tp, VADD(Ts, VADD(Tq, Tr))), ovs, &(xo[WS(os, 1)]));
Chris@10 263 ST(&(xo[0]), VADD(Ty, VADD(TB, VADD(TE, TH))), ovs, &(xo[0]));
/*
 * The remaining 12 outputs are written as pairs (k, 14 - k).  For each
 * pair one term is wrapped in VBYI and the other is not; the two outputs
 * are their VADD and VSUB.
 */
Chris@10 264 {
Chris@10 265 V Tm, Tt, TQ, TP;
/* Outputs (3,11) and (2,12). */
Chris@10 266 Tm = VBYI(VFMA(LDK(KP433883739), T7, VFNMS(LDK(KP781831482), Tl, VMUL(LDK(KP974927912), Te))));
Chris@10 267 Tt = VFMA(LDK(KP623489801), Tq, VFNMS(LDK(KP222520933), Tr, VFNMS(LDK(KP900968867), Ts, Tp)));
Chris@10 268 ST(&(xo[WS(os, 3)]), VADD(Tm, Tt), ovs, &(xo[WS(os, 1)]));
Chris@10 269 ST(&(xo[WS(os, 11)]), VSUB(Tt, Tm), ovs, &(xo[WS(os, 1)]));
Chris@10 270 TQ = VBYI(VFMA(LDK(KP974927912), TJ, VFMA(LDK(KP433883739), TL, VMUL(LDK(KP781831482), TK))));
Chris@10 271 TP = VFMA(LDK(KP623489801), TH, VFNMS(LDK(KP900968867), TE, VFNMS(LDK(KP222520933), TB, Ty)));
Chris@10 272 ST(&(xo[WS(os, 12)]), VSUB(TP, TQ), ovs, &(xo[0]));
Chris@10 273 ST(&(xo[WS(os, 2)]), VADD(TP, TQ), ovs, &(xo[0]));
Chris@10 274 }
Chris@10 275 {
Chris@10 276 V Tu, Tv, TM, TI;
/* Outputs (1,13) and (6,8). */
Chris@10 277 Tu = VBYI(VFMA(LDK(KP781831482), T7, VFMA(LDK(KP974927912), Tl, VMUL(LDK(KP433883739), Te))));
Chris@10 278 Tv = VFMA(LDK(KP623489801), Ts, VFNMS(LDK(KP900968867), Tr, VFNMS(LDK(KP222520933), Tq, Tp)));
Chris@10 279 ST(&(xo[WS(os, 1)]), VADD(Tu, Tv), ovs, &(xo[WS(os, 1)]));
Chris@10 280 ST(&(xo[WS(os, 13)]), VSUB(Tv, Tu), ovs, &(xo[WS(os, 1)]));
Chris@10 281 TM = VBYI(VFNMS(LDK(KP433883739), TK, VFNMS(LDK(KP974927912), TL, VMUL(LDK(KP781831482), TJ))));
Chris@10 282 TI = VFMA(LDK(KP623489801), TB, VFNMS(LDK(KP900968867), TH, VFNMS(LDK(KP222520933), TE, Ty)));
Chris@10 283 ST(&(xo[WS(os, 6)]), VSUB(TI, TM), ovs, &(xo[0]));
Chris@10 284 ST(&(xo[WS(os, 8)]), VADD(TI, TM), ovs, &(xo[0]));
Chris@10 285 }
Chris@10 286 {
Chris@10 287 V TO, TN, Tx, Tw;
/* Outputs (4,10) and (5,9). */
Chris@10 288 TO = VBYI(VFMA(LDK(KP433883739), TJ, VFNMS(LDK(KP974927912), TK, VMUL(LDK(KP781831482), TL))));
Chris@10 289 TN = VFMA(LDK(KP623489801), TE, VFNMS(LDK(KP222520933), TH, VFNMS(LDK(KP900968867), TB, Ty)));
Chris@10 290 ST(&(xo[WS(os, 4)]), VSUB(TN, TO), ovs, &(xo[0]));
Chris@10 291 ST(&(xo[WS(os, 10)]), VADD(TN, TO), ovs, &(xo[0]));
Chris@10 292 Tx = VBYI(VFNMS(LDK(KP781831482), Te, VFNMS(LDK(KP433883739), Tl, VMUL(LDK(KP974927912), T7))));
Chris@10 293 Tw = VFMA(LDK(KP623489801), Tr, VFNMS(LDK(KP900968867), Tq, VFNMS(LDK(KP222520933), Ts, Tp)));
Chris@10 294 ST(&(xo[WS(os, 5)]), VSUB(Tw, Tx), ovs, &(xo[WS(os, 1)]));
Chris@10 295 ST(&(xo[WS(os, 9)]), VADD(Tx, Tw), ovs, &(xo[WS(os, 1)]));
Chris@10 296 }
Chris@10 297 }
Chris@10 298 }
/* Runs once after the loop; defined by the SIMD header (not visible here). */
Chris@10 299 VLEAVE();
Chris@10 300 }
Chris@10 301
/*
 * Descriptor for the non-FMA build of n1bv_14.  The leading 14 matches the
 * generator's "-n 14", and {50, 12, 24, 0} matches the operation counts in
 * the header comment of this branch (50 additions, 12 multiplications,
 * 24 fused multiply/add).  The meaning of the fourth count and of the four
 * trailing zeros is defined by kdft_desc, which is declared elsewhere --
 * TODO confirm against its declaration.
 */
Chris@10 302 static const kdft_desc desc = { 14, XSIMD_STRING("n1bv_14"), {50, 12, 24, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 303
/*
 * Registration entry point: hands the n1bv_14 function and its descriptor
 * to the planner p through X(kdft_register).
 */
Chris@10 304 void XSIMD(codelet_n1bv_14) (planner *p) {
Chris@10 305 X(kdft_register) (p, n1bv_14, &desc);
Chris@10 306 }
Chris@10 307
Chris@10 308 #endif /* HAVE_FMA */