annotate src/fftw-3.3.3/dft/simd/common/n1bv_11.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:37:00 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 11 -name n1bv_11 -include n1b.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 70 FP additions, 60 FP multiplications,
Chris@10 32 * (or, 15 additions, 5 multiplications, 55 fused multiply/add),
Chris@10 33 * 67 stack variables, 11 constants, and 22 memory accesses
Chris@10 34 */
Chris@10 35 #include "n1b.h"
Chris@10 36
/*
 * n1bv_11, HAVE_FMA variant: the kernel that codelet_n1bv_11 registers
 * together with the size-11 descriptor; the generator command line in the
 * comment above lists -n 11 -sign 1.
 *
 * Only ii and io are used here (copied into xi/xo); ri and ro are not
 * referenced in this body.  Each loop pass loads the eleven inputs
 * xi[WS(is, 0..10)], combines them, and stores the eleven outputs
 * xo[WS(os, 0..10)].  Between passes i drops by VL (the loop runs while
 * i > 0, starting from v), xi advances by VL * ivs and xo by VL * ovs.
 * VLEAVE() is invoked once after the loop.
 */
Chris@10 37 static void n1bv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 38 {
Chris@10 39 DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
Chris@10 40 DVK(KP876768831, +0.876768831002589333891339807079336796764054852);
Chris@10 41 DVK(KP918985947, +0.918985947228994779780736114132655398124909697);
Chris@10 42 DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
Chris@10 43 DVK(KP778434453, +0.778434453334651800608337670740821884709317477);
Chris@10 44 DVK(KP830830026, +0.830830026003772851058548298459246407048009821);
Chris@10 45 DVK(KP372785597, +0.372785597771792209609773152906148328659002598);
Chris@10 46 DVK(KP634356270, +0.634356270682424498893150776899916060542806975);
Chris@10 47 DVK(KP715370323, +0.715370323453429719112414662767260662417897278);
Chris@10 48 DVK(KP342584725, +0.342584725681637509502641509861112333758894680);
Chris@10 49 DVK(KP521108558, +0.521108558113202722944698153526659300680427422);
Chris@10 50 {
Chris@10 51 INT i;
Chris@10 52 const R *xi;
Chris@10 53 R *xo;
Chris@10 54 xi = ii; /* all loads below go through xi, i.e. the ii pointer */
Chris@10 55 xo = io; /* all stores below go through xo, i.e. the io pointer */
Chris@10 56 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
Chris@10 57 V T1, Tb, T4, Tq, Tg, Tm, T7, Tp, Ta, To, Tc, T11;
Chris@10 58 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@10 59 {
Chris@10 60 V T2, T3, Te, Tf;
Chris@10 61 T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@10 62 T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@10 63 Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@10 64 Tf = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@10 65 {
Chris@10 66 V T5, T6, T8, T9;
Chris@10 67 T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@10 68 T6 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@10 69 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@10 70 T9 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@10 71 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
/* Sum and difference of each input pair (k, 11 - k). */
Chris@10 72 T4 = VADD(T2, T3); /* inputs 1 and 10 */
Chris@10 73 Tq = VSUB(T2, T3);
Chris@10 74 Tg = VADD(Te, Tf); /* inputs 5 and 6 */
Chris@10 75 Tm = VSUB(Te, Tf);
Chris@10 76 T7 = VADD(T5, T6); /* inputs 2 and 9 */
Chris@10 77 Tp = VSUB(T5, T6);
Chris@10 78 Ta = VADD(T8, T9); /* inputs 3 and 8 */
Chris@10 79 To = VSUB(T8, T9);
Chris@10 80 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@10 81 }
Chris@10 82 }
Chris@10 83 T11 = VFMA(LDK(KP521108558), Tm, Tq);
Chris@10 84 {
Chris@10 85 V TA, TS, TE, TW, Td, Tn, Ts, Tw, Tr, Tv, TT, TF;
Chris@10 86 Tr = VFNMS(LDK(KP521108558), Tq, Tp);
Chris@10 87 Tv = VFNMS(LDK(KP342584725), T7, Tg);
Chris@10 88 TA = VFMA(LDK(KP715370323), To, Tq);
Chris@10 89 TS = VFMA(LDK(KP521108558), To, Tm);
Chris@10 90 TE = VFNMS(LDK(KP342584725), T4, Ta);
Chris@10 91 TW = VFNMS(LDK(KP342584725), Ta, T7);
Chris@10 92 Td = VADD(Tb, Tc); /* inputs 4 and 7 */
Chris@10 93 Tn = VSUB(Tb, Tc);
Chris@10 94 Ts = VFNMS(LDK(KP715370323), Tr, To);
Chris@10 95 Tw = VFNMS(LDK(KP634356270), Tv, T4);
Chris@10 96 TT = VFNMS(LDK(KP715370323), TS, Tp);
Chris@10 97 TF = VFNMS(LDK(KP634356270), TE, Tg);
Chris@10 98 {
Chris@10 99 V Tu, TV, TD, TL, T14, TP, TZ, Tj, Tz, TI, TB, TJ, TM;
Chris@10 100 TB = VFMA(LDK(KP372785597), Tn, TA);
Chris@10 101 TJ = VFNMS(LDK(KP521108558), Tp, Tn);
Chris@10 102 {
Chris@10 103 V T12, TN, TX, Th;
Chris@10 104 T12 = VFMA(LDK(KP715370323), T11, Tn);
/* Output 0 is input 0 (T1) plus the five pair sums, i.e. all 11 inputs added. */
Chris@10 105 ST(&(xo[0]), VADD(Tg, VADD(Td, VADD(Ta, VADD(T7, VADD(T4, T1))))), ovs, &(xo[0]));
Chris@10 106 TN = VFNMS(LDK(KP342584725), Td, T4);
Chris@10 107 TX = VFNMS(LDK(KP634356270), TW, Td);
Chris@10 108 Th = VFNMS(LDK(KP342584725), Tg, Td);
Chris@10 109 {
Chris@10 110 V Tt, Tx, TU, TG;
Chris@10 111 Tt = VFNMS(LDK(KP830830026), Ts, Tn);
Chris@10 112 Tx = VFNMS(LDK(KP778434453), Tw, Ta);
Chris@10 113 TU = VFMA(LDK(KP830830026), TT, Tq);
Chris@10 114 TG = VFNMS(LDK(KP778434453), TF, Td);
Chris@10 115 {
Chris@10 116 V TC, TK, T13, TO;
Chris@10 117 TC = VFNMS(LDK(KP830830026), TB, Tm);
Chris@10 118 TK = VFMA(LDK(KP715370323), TJ, Tm);
Chris@10 119 T13 = VFMA(LDK(KP830830026), T12, Tp);
Chris@10 120 TO = VFNMS(LDK(KP634356270), TN, T7);
Chris@10 121 {
Chris@10 122 V TY, Ti, Ty, TH;
Chris@10 123 TY = VFNMS(LDK(KP778434453), TX, T4);
Chris@10 124 Ti = VFNMS(LDK(KP634356270), Th, Ta);
Chris@10 125 Tu = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), Tt, Tm));
Chris@10 126 Ty = VFNMS(LDK(KP876768831), Tx, Td);
Chris@10 127 TV = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TU, Tn));
Chris@10 128 TH = VFNMS(LDK(KP876768831), TG, T7);
Chris@10 129 TD = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), TC, Tp));
Chris@10 130 TL = VFNMS(LDK(KP830830026), TK, To);
Chris@10 131 T14 = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), T13, To));
Chris@10 132 TP = VFNMS(LDK(KP778434453), TO, Tg);
Chris@10 133 TZ = VFNMS(LDK(KP876768831), TY, Tg);
Chris@10 134 Tj = VFNMS(LDK(KP778434453), Ti, T7);
Chris@10 135 Tz = VFNMS(LDK(KP959492973), Ty, T1);
Chris@10 136 TI = VFNMS(LDK(KP959492973), TH, T1);
Chris@10 137 }
Chris@10 138 }
Chris@10 139 }
Chris@10 140 }
Chris@10 141 TM = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TL, Tq));
Chris@10 142 {
Chris@10 143 V TQ, T10, Tk, TR, Tl;
Chris@10 144 TQ = VFNMS(LDK(KP876768831), TP, Ta);
Chris@10 145 T10 = VFNMS(LDK(KP959492973), TZ, T1);
Chris@10 146 Tk = VFNMS(LDK(KP876768831), Tj, T4);
/*
 * Outputs k and 11 - k are stored as a pair built from the same two
 * operands, one through VFMAI and the other through VFNMSI.
 */
Chris@10 147 ST(&(xo[WS(os, 7)]), VFMAI(TD, Tz), ovs, &(xo[WS(os, 1)]));
Chris@10 148 ST(&(xo[WS(os, 4)]), VFNMSI(TD, Tz), ovs, &(xo[0]));
Chris@10 149 ST(&(xo[WS(os, 8)]), VFNMSI(TM, TI), ovs, &(xo[0]));
Chris@10 150 ST(&(xo[WS(os, 3)]), VFMAI(TM, TI), ovs, &(xo[WS(os, 1)]));
Chris@10 151 TR = VFNMS(LDK(KP959492973), TQ, T1);
Chris@10 152 ST(&(xo[WS(os, 10)]), VFNMSI(T14, T10), ovs, &(xo[0]));
Chris@10 153 ST(&(xo[WS(os, 1)]), VFMAI(T14, T10), ovs, &(xo[WS(os, 1)]));
Chris@10 154 Tl = VFNMS(LDK(KP959492973), Tk, T1);
Chris@10 155 ST(&(xo[WS(os, 9)]), VFMAI(TV, TR), ovs, &(xo[WS(os, 1)]));
Chris@10 156 ST(&(xo[WS(os, 2)]), VFNMSI(TV, TR), ovs, &(xo[0]));
Chris@10 157 ST(&(xo[WS(os, 6)]), VFNMSI(Tu, Tl), ovs, &(xo[0]));
Chris@10 158 ST(&(xo[WS(os, 5)]), VFMAI(Tu, Tl), ovs, &(xo[WS(os, 1)]));
Chris@10 159 }
Chris@10 160 }
Chris@10 161 }
Chris@10 162 }
Chris@10 163 }
Chris@10 164 VLEAVE();
Chris@10 165 }
Chris@10 166
/*
 * Descriptor passed to X(kdft_register) for the HAVE_FMA build of n1bv_11.
 * The leading 11 matches the -n 11 generator option, and {15, 5, 55, 0}
 * repeats the "15 additions, 5 multiplications, 55 fused multiply/add"
 * tally from the generated comment above the function.
 * NOTE(review): the meaning of the trailing zero fields is defined by
 * kdft_desc, which is not visible here -- confirm against its declaration.
 */
Chris@10 167 static const kdft_desc desc = { 11, XSIMD_STRING("n1bv_11"), {15, 5, 55, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 168
/*
 * Registration entry point (HAVE_FMA build): hands the n1bv_11 kernel and
 * its descriptor to planner p through X(kdft_register).
 */
Chris@10 169 void XSIMD(codelet_n1bv_11) (planner *p) {
Chris@10 170 X(kdft_register) (p, n1bv_11, &desc);
Chris@10 171 }
Chris@10 172
Chris@10 173 #else /* HAVE_FMA */
Chris@10 174
Chris@10 175 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 11 -name n1bv_11 -include n1b.h */
Chris@10 176
Chris@10 177 /*
Chris@10 178 * This function contains 70 FP additions, 50 FP multiplications,
Chris@10 179 * (or, 30 additions, 10 multiplications, 40 fused multiply/add),
Chris@10 180 * 32 stack variables, 10 constants, and 22 memory accesses
Chris@10 181 */
Chris@10 182 #include "n1b.h"
Chris@10 183
/*
 * n1bv_11, non-FMA variant (compiled when HAVE_FMA is not defined): the
 * kernel that codelet_n1bv_11 registers together with the size-11
 * descriptor; the generator command line in the comment above lists
 * -n 11 -sign 1.
 *
 * Only ii and io are used here (copied into xi/xo); ri and ro are not
 * referenced in this body.  Each loop pass loads the eleven inputs
 * xi[WS(is, 0..10)] and stores the eleven outputs xo[WS(os, 0..10)].
 * Between passes i drops by VL (the loop runs while i > 0, starting from
 * v), xi advances by VL * ivs and xo by VL * ovs.  VLEAVE() is invoked
 * once after the loop.
 *
 * The ten constants numerically match |cos(2*pi*k/11)| (KP841253532,
 * KP415415013, KP142314838, KP654860733, KP959492973 for k = 1..5) and
 * sin(2*pi*k/11) (KP540640817, KP909631995, KP989821441, KP755749574,
 * KP281732556 for k = 1..5).
 */
Chris@10 184 static void n1bv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 185 {
Chris@10 186 DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
Chris@10 187 DVK(KP654860733, +0.654860733945285064056925072466293553183791199);
Chris@10 188 DVK(KP142314838, +0.142314838273285140443792668616369668791051361);
Chris@10 189 DVK(KP415415013, +0.415415013001886425529274149229623203524004910);
Chris@10 190 DVK(KP841253532, +0.841253532831181168861811648919367717513292498);
Chris@10 191 DVK(KP540640817, +0.540640817455597582107635954318691695431770608);
Chris@10 192 DVK(KP909631995, +0.909631995354518371411715383079028460060241051);
Chris@10 193 DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
Chris@10 194 DVK(KP755749574, +0.755749574354258283774035843972344420179717445);
Chris@10 195 DVK(KP281732556, +0.281732556841429697711417915346616899035777899);
Chris@10 196 {
Chris@10 197 INT i;
Chris@10 198 const R *xi;
Chris@10 199 R *xo;
Chris@10 200 xi = ii; /* all loads below go through xi, i.e. the ii pointer */
Chris@10 201 xo = io; /* all stores below go through xo, i.e. the io pointer */
Chris@10 202 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
Chris@10 203 V Th, T3, Tm, Tf, Ti, Tc, Tj, T9, Tk, T6, Tl, Ta, Tb, Ts, Tt;
Chris@10 204 Th = LD(&(xi[0]), ivs, &(xi[0]));
/* For each input pair (k, 11 - k) keep the difference and the sum. */
Chris@10 205 {
Chris@10 206 V T1, T2, Td, Te;
Chris@10 207 T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@10 208 T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@10 209 T3 = VSUB(T1, T2); /* inputs 1 and 10 */
Chris@10 210 Tm = VADD(T1, T2);
Chris@10 211 Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@10 212 Te = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@10 213 Tf = VSUB(Td, Te); /* inputs 2 and 9 */
Chris@10 214 Ti = VADD(Td, Te);
Chris@10 215 }
Chris@10 216 Ta = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@10 217 Tb = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@10 218 Tc = VSUB(Ta, Tb); /* inputs 4 and 7 */
Chris@10 219 Tj = VADD(Ta, Tb);
Chris@10 220 {
Chris@10 221 V T7, T8, T4, T5;
Chris@10 222 T7 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@10 223 T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@10 224 T9 = VSUB(T7, T8); /* inputs 5 and 6 */
Chris@10 225 Tk = VADD(T7, T8);
Chris@10 226 T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@10 227 T5 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@10 228 T6 = VSUB(T4, T5); /* inputs 3 and 8 */
Chris@10 229 Tl = VADD(T4, T5);
Chris@10 230 }
/* Output 0 is input 0 (Th) plus the five pair sums, i.e. all 11 inputs added. */
Chris@10 231 ST(&(xo[0]), VADD(Th, VADD(Tm, VADD(Ti, VADD(Tl, VADD(Tj, Tk))))), ovs, &(xo[0]));
/*
 * Each remaining output pair (k, 11 - k) is A + B and A - B, where A is a
 * weighted combination of Th and the pair sums and B is VBYI applied to a
 * weighted combination of the pair differences.
 * NOTE(review): VBYI is presumably multiplication by i -- confirm in the
 * SIMD headers.
 */
Chris@10 232 {
Chris@10 233 V Tg, Tn, Tu, Tv;
Chris@10 234 Tg = VBYI(VFMA(LDK(KP281732556), T3, VFMA(LDK(KP755749574), T6, VFNMS(LDK(KP909631995), Tc, VFNMS(LDK(KP540640817), Tf, VMUL(LDK(KP989821441), T9))))));
Chris@10 235 Tn = VFMA(LDK(KP841253532), Ti, VFMA(LDK(KP415415013), Tj, VFNMS(LDK(KP142314838), Tk, VFNMS(LDK(KP654860733), Tl, VFNMS(LDK(KP959492973), Tm, Th)))));
Chris@10 236 ST(&(xo[WS(os, 5)]), VADD(Tg, Tn), ovs, &(xo[WS(os, 1)]));
Chris@10 237 ST(&(xo[WS(os, 6)]), VSUB(Tn, Tg), ovs, &(xo[0]));
Chris@10 238 Tu = VBYI(VFMA(LDK(KP755749574), T3, VFMA(LDK(KP540640817), T6, VFNMS(LDK(KP909631995), T9, VFNMS(LDK(KP989821441), Tf, VMUL(LDK(KP281732556), Tc))))));
Chris@10 239 Tv = VFMA(LDK(KP841253532), Tl, VFMA(LDK(KP415415013), Tk, VFNMS(LDK(KP959492973), Tj, VFNMS(LDK(KP142314838), Ti, VFNMS(LDK(KP654860733), Tm, Th)))));
Chris@10 240 ST(&(xo[WS(os, 4)]), VADD(Tu, Tv), ovs, &(xo[0]));
Chris@10 241 ST(&(xo[WS(os, 7)]), VSUB(Tv, Tu), ovs, &(xo[WS(os, 1)]));
Chris@10 242 }
Chris@10 243 Ts = VBYI(VFMA(LDK(KP909631995), T3, VFNMS(LDK(KP540640817), T9, VFNMS(LDK(KP989821441), Tc, VFNMS(LDK(KP281732556), T6, VMUL(LDK(KP755749574), Tf))))));
Chris@10 244 Tt = VFMA(LDK(KP415415013), Tm, VFMA(LDK(KP841253532), Tk, VFNMS(LDK(KP142314838), Tj, VFNMS(LDK(KP959492973), Tl, VFNMS(LDK(KP654860733), Ti, Th)))));
Chris@10 245 ST(&(xo[WS(os, 2)]), VADD(Ts, Tt), ovs, &(xo[0]));
Chris@10 246 ST(&(xo[WS(os, 9)]), VSUB(Tt, Ts), ovs, &(xo[WS(os, 1)]));
Chris@10 247 {
Chris@10 248 V Tq, Tr, To, Tp;
Chris@10 249 Tq = VBYI(VFMA(LDK(KP540640817), T3, VFMA(LDK(KP909631995), Tf, VFMA(LDK(KP989821441), T6, VFMA(LDK(KP755749574), Tc, VMUL(LDK(KP281732556), T9))))));
Chris@10 250 Tr = VFMA(LDK(KP841253532), Tm, VFMA(LDK(KP415415013), Ti, VFNMS(LDK(KP959492973), Tk, VFNMS(LDK(KP654860733), Tj, VFNMS(LDK(KP142314838), Tl, Th)))));
Chris@10 251 ST(&(xo[WS(os, 1)]), VADD(Tq, Tr), ovs, &(xo[WS(os, 1)]));
Chris@10 252 ST(&(xo[WS(os, 10)]), VSUB(Tr, Tq), ovs, &(xo[0]));
Chris@10 253 To = VBYI(VFMA(LDK(KP989821441), T3, VFMA(LDK(KP540640817), Tc, VFNMS(LDK(KP909631995), T6, VFNMS(LDK(KP281732556), Tf, VMUL(LDK(KP755749574), T9))))));
Chris@10 254 Tp = VFMA(LDK(KP415415013), Tl, VFMA(LDK(KP841253532), Tj, VFNMS(LDK(KP654860733), Tk, VFNMS(LDK(KP959492973), Ti, VFNMS(LDK(KP142314838), Tm, Th)))));
Chris@10 255 ST(&(xo[WS(os, 3)]), VADD(To, Tp), ovs, &(xo[WS(os, 1)]));
Chris@10 256 ST(&(xo[WS(os, 8)]), VSUB(Tp, To), ovs, &(xo[0]));
Chris@10 257 }
Chris@10 258 }
Chris@10 259 }
Chris@10 260 VLEAVE();
Chris@10 261 }
Chris@10 262
/*
 * Descriptor passed to X(kdft_register) for the non-FMA build of n1bv_11.
 * The leading 11 matches the -n 11 generator option, and {30, 10, 40, 0}
 * repeats the "30 additions, 10 multiplications, 40 fused multiply/add"
 * tally from the generated comment above the function.
 * NOTE(review): the meaning of the trailing zero fields is defined by
 * kdft_desc, which is not visible here -- confirm against its declaration.
 */
Chris@10 263 static const kdft_desc desc = { 11, XSIMD_STRING("n1bv_11"), {30, 10, 40, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 264
/*
 * Registration entry point (non-FMA build): hands the n1bv_11 kernel and
 * its descriptor to planner p through X(kdft_register).
 */
Chris@10 265 void XSIMD(codelet_n1bv_11) (planner *p) {
Chris@10 266 X(kdft_register) (p, n1bv_11, &desc);
Chris@10 267 }
Chris@10 268
Chris@10 269 #endif /* HAVE_FMA */