annotate src/fftw-3.3.8/dft/simd/common/n1bv_11.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:56 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 11 -name n1bv_11 -include dft/simd/n1b.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 70 FP additions, 60 FP multiplications,
Chris@82 32 * (or, 15 additions, 5 multiplications, 55 fused multiply/add),
Chris@82 33 * 42 stack variables, 11 constants, and 22 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n1b.h"
Chris@82 36
/*
 * n1bv_11 (FMA variant): size-11 DFT codelet emitted by genfft with
 * "-fma -simd -sign 1 -n 11".  This definition is the one compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Parameters, as used by this body:
 *   ri, ro   - not referenced in this body.
 *   ii       - base pointer of the input; copied to xi.
 *   io       - base pointer of the output; copied to xo.
 *   is, os   - strides handed to WS() to address input/output element k.
 *   v        - loop count; the loop runs while i > 0 and i drops by VL
 *              on every pass.
 *   ivs, ovs - xi advances by VL * ivs and xo by VL * ovs per pass; they
 *              are also forwarded to every LD / ST.
 *
 * Each pass loads eleven inputs, forms sums and differences of inputs k
 * and 11 - k, and stores eleven outputs.
 *
 * NOTE(review): the exact semantics of the SIMD macros (DVK, LDK, LD, ST,
 * VFMA, VFNMS, VFMAI, VFNMSI, MAKE_VOLATILE_STRIDE, VLEAVE) live in the
 * SIMD headers and are not visible here -- confirm there before relying
 * on the summaries below.
 */
Chris@82 37 static void n1bv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
/* Eleven numeric constants referenced through LDK() in the arithmetic below. */
Chris@82 39 DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
Chris@82 40 DVK(KP918985947, +0.918985947228994779780736114132655398124909697);
Chris@82 41 DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
Chris@82 42 DVK(KP830830026, +0.830830026003772851058548298459246407048009821);
Chris@82 43 DVK(KP876768831, +0.876768831002589333891339807079336796764054852);
Chris@82 44 DVK(KP778434453, +0.778434453334651800608337670740821884709317477);
Chris@82 45 DVK(KP372785597, +0.372785597771792209609773152906148328659002598);
Chris@82 46 DVK(KP715370323, +0.715370323453429719112414662767260662417897278);
Chris@82 47 DVK(KP521108558, +0.521108558113202722944698153526659300680427422);
Chris@82 48 DVK(KP634356270, +0.634356270682424498893150776899916060542806975);
Chris@82 49 DVK(KP342584725, +0.342584725681637509502641509861112333758894680);
Chris@82 50 {
Chris@82 51 INT i;
Chris@82 52 const R *xi;
Chris@82 53 R *xo;
/* Only the ii / io pointers are used as the input / output bases. */
Chris@82 54 xi = ii;
Chris@82 55 xo = io;
Chris@82 56 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
Chris@82 57 V T1, T4, Tq, Tg, Tm, T7, Tp, Ta, To, Td, Tn, Ti, Tw, T12, Ts;
Chris@82 58 V TX, TT, TK, TB, TO, TF, T5, T6;
/* T1 holds input 0. */
Chris@82 59 T1 = LD(&(xi[0]), ivs, &(xi[0]));
/* Inputs (1,10) -> sum T4, difference Tq; inputs (5,6) -> sum Tg, difference Tm. */
Chris@82 60 {
Chris@82 61 V T2, T3, Te, Tf;
Chris@82 62 T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 63 T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 64 T4 = VADD(T2, T3);
Chris@82 65 Tq = VSUB(T2, T3);
Chris@82 66 Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 67 Tf = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 68 Tg = VADD(Te, Tf);
Chris@82 69 Tm = VSUB(Te, Tf);
Chris@82 70 }
/* Inputs (2,9) -> sum T7, difference Tp. */
Chris@82 71 T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 72 T6 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 73 T7 = VADD(T5, T6);
Chris@82 74 Tp = VSUB(T5, T6);
/* Inputs (3,8) -> sum Ta, difference To; inputs (4,7) -> sum Td, difference Tn. */
Chris@82 75 {
Chris@82 76 V T8, T9, Tb, Tc;
Chris@82 77 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 78 T9 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 79 Ta = VADD(T8, T9);
Chris@82 80 To = VSUB(T8, T9);
Chris@82 81 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 82 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 83 Td = VADD(Tb, Tc);
Chris@82 84 Tn = VSUB(Tb, Tc);
Chris@82 85 }
/*
 * First group of shared intermediates: Ti, Tw and TX are two-step chains
 * over the pair sums; T12 and Ts are two-step chains over the pair
 * differences.  They are consumed by the output blocks further down.
 */
Chris@82 86 {
Chris@82 87 V Th, Tv, T11, Tr, TW;
Chris@82 88 Th = VFNMS(LDK(KP342584725), Tg, Td);
Chris@82 89 Ti = VFNMS(LDK(KP634356270), Th, Ta);
Chris@82 90 Tv = VFNMS(LDK(KP342584725), T7, Tg);
Chris@82 91 Tw = VFNMS(LDK(KP634356270), Tv, T4);
Chris@82 92 T11 = VFMA(LDK(KP521108558), Tm, Tq);
Chris@82 93 T12 = VFMA(LDK(KP715370323), T11, Tn);
Chris@82 94 Tr = VFNMS(LDK(KP521108558), Tq, Tp);
Chris@82 95 Ts = VFNMS(LDK(KP715370323), Tr, To);
Chris@82 96 TW = VFNMS(LDK(KP342584725), Ta, T7);
Chris@82 97 TX = VFNMS(LDK(KP634356270), TW, Td);
Chris@82 98 }
/*
 * Second group of shared intermediates: TT, TK and TB are built from the
 * pair differences; TO and TF are built from the pair sums.
 */
Chris@82 99 {
Chris@82 100 V TS, TJ, TA, TN, TE;
Chris@82 101 TS = VFMA(LDK(KP521108558), To, Tm);
Chris@82 102 TT = VFNMS(LDK(KP715370323), TS, Tp);
Chris@82 103 TJ = VFNMS(LDK(KP521108558), Tp, Tn);
Chris@82 104 TK = VFMA(LDK(KP715370323), TJ, Tm);
Chris@82 105 TA = VFMA(LDK(KP715370323), To, Tq);
Chris@82 106 TB = VFMA(LDK(KP372785597), Tn, TA);
Chris@82 107 TN = VFNMS(LDK(KP342584725), Td, T4);
Chris@82 108 TO = VFNMS(LDK(KP634356270), TN, T7);
Chris@82 109 TE = VFNMS(LDK(KP342584725), T4, Ta);
Chris@82 110 TF = VFNMS(LDK(KP634356270), TE, Tg);
Chris@82 111 }
/* Output 0 is the sum of input 0 and the five pair sums, i.e. of all eleven inputs. */
Chris@82 112 ST(&(xo[0]), VADD(Tg, VADD(Td, VADD(Ta, VADD(T7, VADD(T4, T1))))), ovs, &(xo[0]));
/*
 * Outputs 5 and 6: Tl is derived from T1 and the pair sums (via Ti), Tu
 * from the pair differences (via Ts).  Output 5 = VFMAI(Tu, Tl),
 * output 6 = VFNMSI(Tu, Tl).
 */
Chris@82 113 {
Chris@82 114 V Tk, Tu, Tj, Tt, Tl;
Chris@82 115 Tj = VFNMS(LDK(KP778434453), Ti, T7);
Chris@82 116 Tk = VFNMS(LDK(KP876768831), Tj, T4);
Chris@82 117 Tt = VFNMS(LDK(KP830830026), Ts, Tn);
Chris@82 118 Tu = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), Tt, Tm));
Chris@82 119 Tl = VFNMS(LDK(KP959492973), Tk, T1);
Chris@82 120 ST(&(xo[WS(os, 5)]), VFMAI(Tu, Tl), ovs, &(xo[WS(os, 1)]));
Chris@82 121 ST(&(xo[WS(os, 6)]), VFNMSI(Tu, Tl), ovs, &(xo[0]));
Chris@82 122 }
/*
 * Outputs 1 and 10: T10 comes from T1 and the pair sums (via TX), T14
 * from the pair differences (via T12).  Output 1 = VFMAI(T14, T10),
 * output 10 = VFNMSI(T14, T10).
 */
Chris@82 123 {
Chris@82 124 V TZ, T14, TY, T13, T10;
Chris@82 125 TY = VFNMS(LDK(KP778434453), TX, T4);
Chris@82 126 TZ = VFNMS(LDK(KP876768831), TY, Tg);
Chris@82 127 T13 = VFMA(LDK(KP830830026), T12, Tp);
Chris@82 128 T14 = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), T13, To));
Chris@82 129 T10 = VFNMS(LDK(KP959492973), TZ, T1);
Chris@82 130 ST(&(xo[WS(os, 1)]), VFMAI(T14, T10), ovs, &(xo[WS(os, 1)]));
Chris@82 131 ST(&(xo[WS(os, 10)]), VFNMSI(T14, T10), ovs, &(xo[0]));
Chris@82 132 }
/*
 * Outputs 2 and 9: TR comes from T1 and the pair sums (via TO), TV from
 * the pair differences (via TT).  Here the lower index takes the VFNMSI
 * form: output 2 = VFNMSI(TV, TR), output 9 = VFMAI(TV, TR).
 */
Chris@82 133 {
Chris@82 134 V TQ, TV, TP, TU, TR;
Chris@82 135 TP = VFNMS(LDK(KP778434453), TO, Tg);
Chris@82 136 TQ = VFNMS(LDK(KP876768831), TP, Ta);
Chris@82 137 TU = VFMA(LDK(KP830830026), TT, Tq);
Chris@82 138 TV = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TU, Tn));
Chris@82 139 TR = VFNMS(LDK(KP959492973), TQ, T1);
Chris@82 140 ST(&(xo[WS(os, 2)]), VFNMSI(TV, TR), ovs, &(xo[0]));
Chris@82 141 ST(&(xo[WS(os, 9)]), VFMAI(TV, TR), ovs, &(xo[WS(os, 1)]));
Chris@82 142 }
/*
 * Outputs 3 and 8: TI comes from T1 and the pair sums (via TF), TM from
 * the pair differences (via TK).  Output 3 = VFMAI(TM, TI),
 * output 8 = VFNMSI(TM, TI).
 */
Chris@82 143 {
Chris@82 144 V TH, TM, TG, TL, TI;
Chris@82 145 TG = VFNMS(LDK(KP778434453), TF, Td);
Chris@82 146 TH = VFNMS(LDK(KP876768831), TG, T7);
Chris@82 147 TL = VFNMS(LDK(KP830830026), TK, To);
Chris@82 148 TM = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TL, Tq));
Chris@82 149 TI = VFNMS(LDK(KP959492973), TH, T1);
Chris@82 150 ST(&(xo[WS(os, 3)]), VFMAI(TM, TI), ovs, &(xo[WS(os, 1)]));
Chris@82 151 ST(&(xo[WS(os, 8)]), VFNMSI(TM, TI), ovs, &(xo[0]));
Chris@82 152 }
/*
 * Outputs 4 and 7: Tz comes from T1 and the pair sums (via Tw), TD from
 * the pair differences (via TB).  As with outputs 2/9, the lower index
 * takes the VFNMSI form: output 4 = VFNMSI(TD, Tz), output 7 = VFMAI(TD, Tz).
 */
Chris@82 153 {
Chris@82 154 V Ty, TD, Tx, TC, Tz;
Chris@82 155 Tx = VFNMS(LDK(KP778434453), Tw, Ta);
Chris@82 156 Ty = VFNMS(LDK(KP876768831), Tx, Td);
Chris@82 157 TC = VFNMS(LDK(KP830830026), TB, Tm);
Chris@82 158 TD = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), TC, Tp));
Chris@82 159 Tz = VFNMS(LDK(KP959492973), Ty, T1);
Chris@82 160 ST(&(xo[WS(os, 4)]), VFNMSI(TD, Tz), ovs, &(xo[0]));
Chris@82 161 ST(&(xo[WS(os, 7)]), VFMAI(TD, Tz), ovs, &(xo[WS(os, 1)]));
Chris@82 162 }
Chris@82 163 }
Chris@82 164 }
/* NOTE(review): VLEAVE() is an opaque SIMD-layer macro invoked once after the loop -- see the SIMD header for its effect. */
Chris@82 165 VLEAVE();
Chris@82 166 }
Chris@82 167
/*
 * Descriptor handed to X(kdft_register) together with n1bv_11 (FMA build).
 * It carries the size (11), the codelet name string, and the counts
 * {15, 5, 55, 0}, whose first three entries match the "15 additions,
 * 5 multiplications, 55 fused multiply/add" figures in the generator
 * comment of this branch.
 * NOTE(review): the meaning of the fourth count, of GENUS and of the four
 * trailing zero fields is defined by kdft_desc elsewhere -- confirm there.
 */
Chris@82 168 static const kdft_desc desc = { 11, XSIMD_STRING("n1bv_11"), {15, 5, 55, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 169
/*
 * Registration entry point for the FMA build: passes the n1bv_11 codelet
 * and its descriptor to X(kdft_register) for planner p.
 * NOTE(review): XSIMD() and X() presumably decorate the symbol names for
 * the active SIMD flavour / precision -- confirm in the project headers.
 */
Chris@82 170 void XSIMD(codelet_n1bv_11) (planner *p) {
Chris@82 171 X(kdft_register) (p, n1bv_11, &desc);
Chris@82 172 }
Chris@82 173
Chris@82 174 #else
Chris@82 175
Chris@82 176 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 11 -name n1bv_11 -include dft/simd/n1b.h */
Chris@82 177
Chris@82 178 /*
Chris@82 179 * This function contains 70 FP additions, 50 FP multiplications,
Chris@82 180 * (or, 30 additions, 10 multiplications, 40 fused multiply/add),
Chris@82 181 * 32 stack variables, 10 constants, and 22 memory accesses
Chris@82 182 */
Chris@82 183 #include "dft/simd/n1b.h"
Chris@82 184
/*
 * n1bv_11 (non-FMA variant): size-11 DFT codelet emitted by genfft with
 * "-simd -sign 1 -n 11" (no -fma).  This definition sits in the #else
 * branch, so it is compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Parameters, as used by this body:
 *   ri, ro   - not referenced in this body.
 *   ii       - base pointer of the input; copied to xi.
 *   io       - base pointer of the output; copied to xo.
 *   is, os   - strides handed to WS() to address input/output element k.
 *   v        - loop count; the loop runs while i > 0 and i drops by VL
 *              on every pass.
 *   ivs, ovs - xi advances by VL * ivs and xo by VL * ovs per pass; they
 *              are also forwarded to every LD / ST.
 *
 * Each pass loads eleven inputs, forms sums and differences of inputs k
 * and 11 - k, and writes every output pair (k, 11 - k) as A + B and A - B,
 * where A combines input 0 with the five sums and B is VBYI() of a
 * combination of the five differences.
 *
 * NOTE(review): the exact semantics of the SIMD macros (DVK, LDK, LD, ST,
 * VFMA, VFNMS, VBYI, MAKE_VOLATILE_STRIDE, VLEAVE) live in the SIMD
 * headers and are not visible here -- confirm there.
 */
Chris@82 185 static void n1bv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 186 {
/*
 * Ten numeric constants referenced through LDK() below.
 * NOTE(review): the values look like |cos(2*pi*k/11)| (841253532,
 * 415415013, 142314838, 654860733, 959492973) and sin(2*pi*k/11)
 * (540640817, 909631995, 989821441, 755749574, 281732556) for k = 1..5
 * -- confirm against the generator.
 */
Chris@82 187 DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
Chris@82 188 DVK(KP654860733, +0.654860733945285064056925072466293553183791199);
Chris@82 189 DVK(KP142314838, +0.142314838273285140443792668616369668791051361);
Chris@82 190 DVK(KP415415013, +0.415415013001886425529274149229623203524004910);
Chris@82 191 DVK(KP841253532, +0.841253532831181168861811648919367717513292498);
Chris@82 192 DVK(KP540640817, +0.540640817455597582107635954318691695431770608);
Chris@82 193 DVK(KP909631995, +0.909631995354518371411715383079028460060241051);
Chris@82 194 DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
Chris@82 195 DVK(KP755749574, +0.755749574354258283774035843972344420179717445);
Chris@82 196 DVK(KP281732556, +0.281732556841429697711417915346616899035777899);
Chris@82 197 {
Chris@82 198 INT i;
Chris@82 199 const R *xi;
Chris@82 200 R *xo;
/* Only the ii / io pointers are used as the input / output bases. */
Chris@82 201 xi = ii;
Chris@82 202 xo = io;
Chris@82 203 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
Chris@82 204 V Th, T3, Tm, Tf, Ti, Tc, Tj, T9, Tk, T6, Tl, Ta, Tb, Ts, Tt;
/* Th holds input 0. */
Chris@82 205 Th = LD(&(xi[0]), ivs, &(xi[0]));
/* Inputs (1,10) -> difference T3, sum Tm; inputs (2,9) -> difference Tf, sum Ti. */
Chris@82 206 {
Chris@82 207 V T1, T2, Td, Te;
Chris@82 208 T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 209 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 210 T3 = VSUB(T1, T2);
Chris@82 211 Tm = VADD(T1, T2);
Chris@82 212 Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 213 Te = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 214 Tf = VSUB(Td, Te);
Chris@82 215 Ti = VADD(Td, Te);
Chris@82 216 }
/* Inputs (4,7) -> difference Tc, sum Tj. */
Chris@82 217 Ta = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 218 Tb = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 219 Tc = VSUB(Ta, Tb);
Chris@82 220 Tj = VADD(Ta, Tb);
/* Inputs (5,6) -> difference T9, sum Tk; inputs (3,8) -> difference T6, sum Tl. */
Chris@82 221 {
Chris@82 222 V T7, T8, T4, T5;
Chris@82 223 T7 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 224 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 225 T9 = VSUB(T7, T8);
Chris@82 226 Tk = VADD(T7, T8);
Chris@82 227 T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 228 T5 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 229 T6 = VSUB(T4, T5);
Chris@82 230 Tl = VADD(T4, T5);
Chris@82 231 }
/* Output 0 is the sum of input 0 and the five pair sums, i.e. of all eleven inputs. */
Chris@82 232 ST(&(xo[0]), VADD(Th, VADD(Tm, VADD(Ti, VADD(Tl, VADD(Tj, Tk))))), ovs, &(xo[0]));
/*
 * Outputs (5,6) and (4,7).  For each pair, the VBYI() term (Tg / Tu) mixes
 * the five differences and the other term (Tn / Tv) mixes Th with the
 * five sums; the lower index stores their sum, the higher index stores
 * (sum-term - VBYI-term).
 */
Chris@82 233 {
Chris@82 234 V Tg, Tn, Tu, Tv;
Chris@82 235 Tg = VBYI(VFMA(LDK(KP281732556), T3, VFMA(LDK(KP755749574), T6, VFNMS(LDK(KP909631995), Tc, VFNMS(LDK(KP540640817), Tf, VMUL(LDK(KP989821441), T9))))));
Chris@82 236 Tn = VFMA(LDK(KP841253532), Ti, VFMA(LDK(KP415415013), Tj, VFNMS(LDK(KP142314838), Tk, VFNMS(LDK(KP654860733), Tl, VFNMS(LDK(KP959492973), Tm, Th)))));
Chris@82 237 ST(&(xo[WS(os, 5)]), VADD(Tg, Tn), ovs, &(xo[WS(os, 1)]));
Chris@82 238 ST(&(xo[WS(os, 6)]), VSUB(Tn, Tg), ovs, &(xo[0]));
Chris@82 239 Tu = VBYI(VFMA(LDK(KP755749574), T3, VFMA(LDK(KP540640817), T6, VFNMS(LDK(KP909631995), T9, VFNMS(LDK(KP989821441), Tf, VMUL(LDK(KP281732556), Tc))))));
Chris@82 240 Tv = VFMA(LDK(KP841253532), Tl, VFMA(LDK(KP415415013), Tk, VFNMS(LDK(KP959492973), Tj, VFNMS(LDK(KP142314838), Ti, VFNMS(LDK(KP654860733), Tm, Th)))));
Chris@82 241 ST(&(xo[WS(os, 4)]), VADD(Tu, Tv), ovs, &(xo[0]));
Chris@82 242 ST(&(xo[WS(os, 7)]), VSUB(Tv, Tu), ovs, &(xo[WS(os, 1)]));
Chris@82 243 }
/* Outputs (2,9): same pattern, with Ts as the VBYI() term and Tt as the sum term. */
Chris@82 244 Ts = VBYI(VFMA(LDK(KP909631995), T3, VFNMS(LDK(KP540640817), T9, VFNMS(LDK(KP989821441), Tc, VFNMS(LDK(KP281732556), T6, VMUL(LDK(KP755749574), Tf))))));
Chris@82 245 Tt = VFMA(LDK(KP415415013), Tm, VFMA(LDK(KP841253532), Tk, VFNMS(LDK(KP142314838), Tj, VFNMS(LDK(KP959492973), Tl, VFNMS(LDK(KP654860733), Ti, Th)))));
Chris@82 246 ST(&(xo[WS(os, 2)]), VADD(Ts, Tt), ovs, &(xo[0]));
Chris@82 247 ST(&(xo[WS(os, 9)]), VSUB(Tt, Ts), ovs, &(xo[WS(os, 1)]));
/* Outputs (1,10) use Tq / Tr and outputs (3,8) use To / Tp, again as sum and difference. */
Chris@82 248 {
Chris@82 249 V Tq, Tr, To, Tp;
Chris@82 250 Tq = VBYI(VFMA(LDK(KP540640817), T3, VFMA(LDK(KP909631995), Tf, VFMA(LDK(KP989821441), T6, VFMA(LDK(KP755749574), Tc, VMUL(LDK(KP281732556), T9))))));
Chris@82 251 Tr = VFMA(LDK(KP841253532), Tm, VFMA(LDK(KP415415013), Ti, VFNMS(LDK(KP959492973), Tk, VFNMS(LDK(KP654860733), Tj, VFNMS(LDK(KP142314838), Tl, Th)))));
Chris@82 252 ST(&(xo[WS(os, 1)]), VADD(Tq, Tr), ovs, &(xo[WS(os, 1)]));
Chris@82 253 ST(&(xo[WS(os, 10)]), VSUB(Tr, Tq), ovs, &(xo[0]));
Chris@82 254 To = VBYI(VFMA(LDK(KP989821441), T3, VFMA(LDK(KP540640817), Tc, VFNMS(LDK(KP909631995), T6, VFNMS(LDK(KP281732556), Tf, VMUL(LDK(KP755749574), T9))))));
Chris@82 255 Tp = VFMA(LDK(KP415415013), Tl, VFMA(LDK(KP841253532), Tj, VFNMS(LDK(KP654860733), Tk, VFNMS(LDK(KP959492973), Ti, VFNMS(LDK(KP142314838), Tm, Th)))));
Chris@82 256 ST(&(xo[WS(os, 3)]), VADD(To, Tp), ovs, &(xo[WS(os, 1)]));
Chris@82 257 ST(&(xo[WS(os, 8)]), VSUB(Tp, To), ovs, &(xo[0]));
Chris@82 258 }
Chris@82 259 }
Chris@82 260 }
/* NOTE(review): VLEAVE() is an opaque SIMD-layer macro invoked once after the loop -- see the SIMD header for its effect. */
Chris@82 261 VLEAVE();
Chris@82 262 }
Chris@82 263
/*
 * Descriptor handed to X(kdft_register) together with n1bv_11 (non-FMA
 * build).  It carries the size (11), the codelet name string, and the
 * counts {30, 10, 40, 0}, whose first three entries match the
 * "30 additions, 10 multiplications, 40 fused multiply/add" figures in
 * the generator comment of this branch.
 * NOTE(review): the meaning of the fourth count, of GENUS and of the four
 * trailing zero fields is defined by kdft_desc elsewhere -- confirm there.
 */
Chris@82 264 static const kdft_desc desc = { 11, XSIMD_STRING("n1bv_11"), {30, 10, 40, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 265
/*
 * Registration entry point for the non-FMA build: passes the n1bv_11
 * codelet and its descriptor to X(kdft_register) for planner p.
 * NOTE(review): XSIMD() and X() presumably decorate the symbol names for
 * the active SIMD flavour / precision -- confirm in the project headers.
 */
Chris@82 266 void XSIMD(codelet_n1bv_11) (planner *p) {
Chris@82 267 X(kdft_register) (p, n1bv_11, &desc);
Chris@82 268 }
Chris@82 269
Chris@82 270 #endif