annotate src/fftw-3.3.8/dft/simd/common/n1fv_11.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:51 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 11 -name n1fv_11 -include dft/simd/n1f.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 70 FP additions, 60 FP multiplications,
Chris@82 32 * (or, 15 additions, 5 multiplications, 55 fused multiply/add),
Chris@82 33 * 42 stack variables, 11 constants, and 22 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n1f.h"
Chris@82 36
/*
 * n1fv_11 (variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): generated SIMD kernel for the
 * size-11 no-twiddle codelet (genfft was run with "-n 11 -fma -simd").
 *
 * Parameters as used by the code below:
 *   ri, ro   - input and output base pointers; they are copied to xi/xo
 *              and advanced by VL * ivs / VL * ovs per loop iteration.
 *   ii, io   - not referenced anywhere in this body.
 *   is, os   - strides passed to WS() to address elements 1..10 of the
 *              input and output.
 *   v        - loop count; the loop runs while i > 0 and subtracts VL
 *              each iteration.
 *   ivs, ovs - per-iteration pointer advance (scaled by VL); also passed
 *              through to LD() and ST().
 *
 * Structure of one iteration: load 11 inputs, form the five sums and five
 * differences of the pairs (1,10), (2,9), (3,8), (4,7), (5,6), store
 * output 0 as the sum of everything, then produce the remaining ten
 * outputs two at a time from a shared "sum" term and a shared
 * "difference" term.
 *
 * NOTE(review): the exact semantics of the V* / LD / ST / DVK / LDK macros
 * come from dft/simd/n1f.h and are not visible here.
 */
Chris@82 37 static void n1fv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
/* The 11 numeric constants used by this variant. */
Chris@82 39 DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
Chris@82 40 DVK(KP918985947, +0.918985947228994779780736114132655398124909697);
Chris@82 41 DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
Chris@82 42 DVK(KP830830026, +0.830830026003772851058548298459246407048009821);
Chris@82 43 DVK(KP876768831, +0.876768831002589333891339807079336796764054852);
Chris@82 44 DVK(KP778434453, +0.778434453334651800608337670740821884709317477);
Chris@82 45 DVK(KP372785597, +0.372785597771792209609773152906148328659002598);
Chris@82 46 DVK(KP715370323, +0.715370323453429719112414662767260662417897278);
Chris@82 47 DVK(KP521108558, +0.521108558113202722944698153526659300680427422);
Chris@82 48 DVK(KP634356270, +0.634356270682424498893150776899916060542806975);
Chris@82 49 DVK(KP342584725, +0.342584725681637509502641509861112333758894680);
Chris@82 50 {
Chris@82 51 INT i;
Chris@82 52 const R *xi;
Chris@82 53 R *xo;
Chris@82 54 xi = ri;
Chris@82 55 xo = ro;
/* Each pass consumes VL from the count and steps both pointers by VL vector strides. */
Chris@82 56 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
Chris@82 57 V T1, T4, Tp, Tg, Tq, T7, Tn, Ta, Tm, Td, To, Ti, Tw, T12, Ts;
Chris@82 58 V TX, TT, TK, TB, TO, TF, T5, T6;
/* T1 is input element 0. */
Chris@82 59 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 60 {
Chris@82 61 V T2, T3, Te, Tf;
/* Pair (1,10): T4 = sum, Tp = difference.  Pair (5,6): Tg = sum, Tq = difference. */
Chris@82 62 T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 63 T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 64 T4 = VADD(T2, T3);
Chris@82 65 Tp = VSUB(T3, T2);
Chris@82 66 Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 67 Tf = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 68 Tg = VADD(Te, Tf);
Chris@82 69 Tq = VSUB(Tf, Te);
Chris@82 70 }
/* Pair (2,9): T7 = sum, Tn = difference. */
Chris@82 71 T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 72 T6 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 73 T7 = VADD(T5, T6);
Chris@82 74 Tn = VSUB(T6, T5);
Chris@82 75 {
Chris@82 76 V T8, T9, Tb, Tc;
/* Pair (3,8): Ta = sum, Tm = difference.  Pair (4,7): Td = sum, To = difference. */
Chris@82 77 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 78 T9 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 79 Ta = VADD(T8, T9);
Chris@82 80 Tm = VSUB(T9, T8);
Chris@82 81 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 82 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 83 Td = VADD(Tb, Tc);
Chris@82 84 To = VSUB(Tc, Tb);
Chris@82 85 }
/* First two stages of the nested multiply/add chains; results are consumed by the output blocks below. */
Chris@82 86 {
Chris@82 87 V Th, Tv, T11, Tr, TW;
Chris@82 88 Th = VFNMS(LDK(KP342584725), Ta, T7);
Chris@82 89 Ti = VFNMS(LDK(KP634356270), Th, Td);
Chris@82 90 Tv = VFNMS(LDK(KP342584725), Td, T4);
Chris@82 91 Tw = VFNMS(LDK(KP634356270), Tv, T7);
Chris@82 92 T11 = VFNMS(LDK(KP521108558), Tp, Tn);
Chris@82 93 T12 = VFNMS(LDK(KP715370323), T11, Tm);
Chris@82 94 Tr = VFMA(LDK(KP521108558), Tq, Tp);
Chris@82 95 Ts = VFMA(LDK(KP715370323), Tr, To);
Chris@82 96 TW = VFNMS(LDK(KP342584725), Tg, Td);
Chris@82 97 TX = VFNMS(LDK(KP634356270), TW, Ta);
Chris@82 98 }
Chris@82 99 {
Chris@82 100 V TS, TJ, TA, TN, TE;
Chris@82 101 TS = VFMA(LDK(KP715370323), Tm, Tp);
Chris@82 102 TT = VFMA(LDK(KP372785597), To, TS);
Chris@82 103 TJ = VFNMS(LDK(KP521108558), Tn, To);
Chris@82 104 TK = VFMA(LDK(KP715370323), TJ, Tq);
Chris@82 105 TA = VFMA(LDK(KP521108558), Tm, Tq);
Chris@82 106 TB = VFNMS(LDK(KP715370323), TA, Tn);
Chris@82 107 TN = VFNMS(LDK(KP342584725), T7, Tg);
Chris@82 108 TO = VFNMS(LDK(KP634356270), TN, T4);
Chris@82 109 TE = VFNMS(LDK(KP342584725), T4, Ta);
Chris@82 110 TF = VFNMS(LDK(KP634356270), TE, Tg);
Chris@82 111 }
/* Output 0: element 0 plus all five pair sums. */
Chris@82 112 ST(&(xo[0]), VADD(T1, VADD(T4, VADD(T7, VADD(Ta, VADD(Td, Tg))))), ovs, &(xo[0]));
/* Outputs 10 and 1: Tl is built from T1 and the pair sums, Tu from the pair differences. */
Chris@82 113 {
Chris@82 114 V Tk, Tu, Tj, Tt, Tl;
Chris@82 115 Tj = VFNMS(LDK(KP778434453), Ti, T4);
Chris@82 116 Tk = VFNMS(LDK(KP876768831), Tj, Tg);
Chris@82 117 Tt = VFMA(LDK(KP830830026), Ts, Tn);
Chris@82 118 Tu = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), Tt, Tm));
Chris@82 119 Tl = VFNMS(LDK(KP959492973), Tk, T1);
Chris@82 120 ST(&(xo[WS(os, 10)]), VFNMSI(Tu, Tl), ovs, &(xo[0]));
Chris@82 121 ST(&(xo[WS(os, 1)]), VFMAI(Tu, Tl), ovs, &(xo[WS(os, 1)]));
Chris@82 122 }
/* Outputs 6 and 5. */
Chris@82 123 {
Chris@82 124 V TZ, T14, TY, T13, T10;
Chris@82 125 TY = VFNMS(LDK(KP778434453), TX, T7);
Chris@82 126 TZ = VFNMS(LDK(KP876768831), TY, T4);
Chris@82 127 T13 = VFNMS(LDK(KP830830026), T12, To);
Chris@82 128 T14 = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), T13, Tq));
Chris@82 129 T10 = VFNMS(LDK(KP959492973), TZ, T1);
Chris@82 130 ST(&(xo[WS(os, 6)]), VFNMSI(T14, T10), ovs, &(xo[0]));
Chris@82 131 ST(&(xo[WS(os, 5)]), VFMAI(T14, T10), ovs, &(xo[WS(os, 1)]));
Chris@82 132 }
/* Outputs 4 and 7. */
Chris@82 133 {
Chris@82 134 V TQ, TV, TP, TU, TR;
Chris@82 135 TP = VFNMS(LDK(KP778434453), TO, Ta);
Chris@82 136 TQ = VFNMS(LDK(KP876768831), TP, Td);
Chris@82 137 TU = VFNMS(LDK(KP830830026), TT, Tq);
Chris@82 138 TV = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), TU, Tn));
Chris@82 139 TR = VFNMS(LDK(KP959492973), TQ, T1);
Chris@82 140 ST(&(xo[WS(os, 4)]), VFNMSI(TV, TR), ovs, &(xo[0]));
Chris@82 141 ST(&(xo[WS(os, 7)]), VFMAI(TV, TR), ovs, &(xo[WS(os, 1)]));
Chris@82 142 }
/* Outputs 8 and 3. */
Chris@82 143 {
Chris@82 144 V TH, TM, TG, TL, TI;
Chris@82 145 TG = VFNMS(LDK(KP778434453), TF, Td);
Chris@82 146 TH = VFNMS(LDK(KP876768831), TG, T7);
Chris@82 147 TL = VFNMS(LDK(KP830830026), TK, Tm);
Chris@82 148 TM = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TL, Tp));
Chris@82 149 TI = VFNMS(LDK(KP959492973), TH, T1);
Chris@82 150 ST(&(xo[WS(os, 8)]), VFNMSI(TM, TI), ovs, &(xo[0]));
Chris@82 151 ST(&(xo[WS(os, 3)]), VFMAI(TM, TI), ovs, &(xo[WS(os, 1)]));
Chris@82 152 }
/* Outputs 2 and 9. */
Chris@82 153 {
Chris@82 154 V Ty, TD, Tx, TC, Tz;
Chris@82 155 Tx = VFNMS(LDK(KP778434453), Tw, Tg);
Chris@82 156 Ty = VFNMS(LDK(KP876768831), Tx, Ta);
Chris@82 157 TC = VFMA(LDK(KP830830026), TB, Tp);
Chris@82 158 TD = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TC, To));
Chris@82 159 Tz = VFNMS(LDK(KP959492973), Ty, T1);
Chris@82 160 ST(&(xo[WS(os, 2)]), VFNMSI(TD, Tz), ovs, &(xo[0]));
Chris@82 161 ST(&(xo[WS(os, 9)]), VFMAI(TD, Tz), ovs, &(xo[WS(os, 1)]));
Chris@82 162 }
Chris@82 163 }
Chris@82 164 }
Chris@82 165 VLEAVE();
Chris@82 166 }
Chris@82 167
/*
 * Descriptor handed to the planner for the FMA variant: size 11, the name
 * string "n1fv_11", and the operation counts {15, 5, 55, 0}, whose first
 * three entries match the additions / multiplications / fused multiply-adds
 * reported in the generator's header comment for this variant.
 * NOTE(review): the meaning of the fourth count, of GENUS and of the four
 * trailing zeros is defined outside this file - confirm against kdft_desc.
 */
Chris@82 168 static const kdft_desc desc = { 11, XSIMD_STRING("n1fv_11"), {15, 5, 55, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 169
/*
 * Registration entry point (FMA variant): passes the n1fv_11 kernel and its
 * descriptor `desc` to X(kdft_register) for planner `p`.
 */
Chris@82 170 void XSIMD(codelet_n1fv_11) (planner *p) {
Chris@82 171 X(kdft_register) (p, n1fv_11, &desc);
Chris@82 172 }
Chris@82 173
Chris@82 174 #else
Chris@82 175
Chris@82 176 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 11 -name n1fv_11 -include dft/simd/n1f.h */
Chris@82 177
Chris@82 178 /*
Chris@82 179 * This function contains 70 FP additions, 50 FP multiplications,
Chris@82 180 * (or, 30 additions, 10 multiplications, 40 fused multiply/add),
Chris@82 181 * 32 stack variables, 10 constants, and 22 memory accesses
Chris@82 182 */
Chris@82 183 #include "dft/simd/n1f.h"
Chris@82 184
/*
 * n1fv_11 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): generated SIMD kernel for the
 * size-11 no-twiddle codelet (genfft was run with "-n 11 -simd", without
 * "-fma").
 *
 * Parameters as used by the code below:
 *   ri, ro   - input and output base pointers; they are copied to xi/xo
 *              and advanced by VL * ivs / VL * ovs per loop iteration.
 *   ii, io   - not referenced anywhere in this body.
 *   is, os   - strides passed to WS() to address elements 1..10 of the
 *              input and output.
 *   v        - loop count; the loop runs while i > 0 and subtracts VL
 *              each iteration.
 *   ivs, ovs - per-iteration pointer advance (scaled by VL); also passed
 *              through to LD() and ST().
 *
 * Structure of one iteration: load 11 inputs, form the five sums and five
 * differences of the pairs (1,10), (2,9), (3,8), (4,7), (5,6), store
 * output 0 as the sum of everything, then for each remaining output pair
 * compute one linear combination of T1 and the sums and one VBYI()-wrapped
 * linear combination of the differences, storing their VSUB and VADD.
 *
 * NOTE(review): the exact semantics of the V* / LD / ST / DVK / LDK macros
 * come from dft/simd/n1f.h and are not visible here.
 */
Chris@82 185 static void n1fv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 186 {
/* The 10 numeric constants used by this variant. */
Chris@82 187 DVK(KP654860733, +0.654860733945285064056925072466293553183791199);
Chris@82 188 DVK(KP142314838, +0.142314838273285140443792668616369668791051361);
Chris@82 189 DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
Chris@82 190 DVK(KP415415013, +0.415415013001886425529274149229623203524004910);
Chris@82 191 DVK(KP841253532, +0.841253532831181168861811648919367717513292498);
Chris@82 192 DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
Chris@82 193 DVK(KP909631995, +0.909631995354518371411715383079028460060241051);
Chris@82 194 DVK(KP281732556, +0.281732556841429697711417915346616899035777899);
Chris@82 195 DVK(KP540640817, +0.540640817455597582107635954318691695431770608);
Chris@82 196 DVK(KP755749574, +0.755749574354258283774035843972344420179717445);
Chris@82 197 {
Chris@82 198 INT i;
Chris@82 199 const R *xi;
Chris@82 200 R *xo;
Chris@82 201 xi = ri;
Chris@82 202 xo = ro;
/* Each pass consumes VL from the count and steps both pointers by VL vector strides. */
Chris@82 203 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
Chris@82 204 V T1, T4, Ti, Tg, Tl, Td, Tk, Ta, Tj, T7, Tm, Tb, Tc, Tt, Ts;
/* T1 is input element 0. */
Chris@82 205 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 206 {
Chris@82 207 V T2, T3, Te, Tf;
/* Pair (1,10): T4 = sum, Ti = difference.  Pair (5,6): Tg = sum, Tl = difference. */
Chris@82 208 T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 209 T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 210 T4 = VADD(T2, T3);
Chris@82 211 Ti = VSUB(T3, T2);
Chris@82 212 Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 213 Tf = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 214 Tg = VADD(Te, Tf);
Chris@82 215 Tl = VSUB(Tf, Te);
Chris@82 216 }
/* Pair (4,7): Td = sum, Tk = difference. */
Chris@82 217 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 218 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 219 Td = VADD(Tb, Tc);
Chris@82 220 Tk = VSUB(Tc, Tb);
Chris@82 221 {
Chris@82 222 V T8, T9, T5, T6;
/* Pair (3,8): Ta = sum, Tj = difference.  Pair (2,9): T7 = sum, Tm = difference. */
Chris@82 223 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 224 T9 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 225 Ta = VADD(T8, T9);
Chris@82 226 Tj = VSUB(T9, T8);
Chris@82 227 T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 228 T6 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 229 T7 = VADD(T5, T6);
Chris@82 230 Tm = VSUB(T6, T5);
Chris@82 231 }
/* Output 0: element 0 plus all five pair sums. */
Chris@82 232 ST(&(xo[0]), VADD(T1, VADD(T4, VADD(T7, VADD(Ta, VADD(Td, Tg))))), ovs, &(xo[0]));
Chris@82 233 {
Chris@82 234 V Tn, Th, Tv, Tu;
/* Outputs 7 and 4: Th combines T1 with the sums, Tn combines the differences. */
Chris@82 235 Tn = VBYI(VFMA(LDK(KP755749574), Ti, VFMA(LDK(KP540640817), Tj, VFNMS(LDK(KP909631995), Tl, VFNMS(LDK(KP989821441), Tm, VMUL(LDK(KP281732556), Tk))))));
Chris@82 236 Th = VFMA(LDK(KP841253532), Ta, VFMA(LDK(KP415415013), Tg, VFNMS(LDK(KP959492973), Td, VFNMS(LDK(KP142314838), T7, VFNMS(LDK(KP654860733), T4, T1)))));
Chris@82 237 ST(&(xo[WS(os, 7)]), VSUB(Th, Tn), ovs, &(xo[WS(os, 1)]));
Chris@82 238 ST(&(xo[WS(os, 4)]), VADD(Th, Tn), ovs, &(xo[0]));
/* Outputs 6 and 5. */
Chris@82 239 Tv = VBYI(VFMA(LDK(KP281732556), Ti, VFMA(LDK(KP755749574), Tj, VFNMS(LDK(KP909631995), Tk, VFNMS(LDK(KP540640817), Tm, VMUL(LDK(KP989821441), Tl))))));
Chris@82 240 Tu = VFMA(LDK(KP841253532), T7, VFMA(LDK(KP415415013), Td, VFNMS(LDK(KP142314838), Tg, VFNMS(LDK(KP654860733), Ta, VFNMS(LDK(KP959492973), T4, T1)))));
Chris@82 241 ST(&(xo[WS(os, 6)]), VSUB(Tu, Tv), ovs, &(xo[0]));
Chris@82 242 ST(&(xo[WS(os, 5)]), VADD(Tu, Tv), ovs, &(xo[WS(os, 1)]));
Chris@82 243 }
/* Outputs 8 and 3. */
Chris@82 244 Tt = VBYI(VFMA(LDK(KP989821441), Ti, VFMA(LDK(KP540640817), Tk, VFNMS(LDK(KP909631995), Tj, VFNMS(LDK(KP281732556), Tm, VMUL(LDK(KP755749574), Tl))))));
Chris@82 245 Ts = VFMA(LDK(KP415415013), Ta, VFMA(LDK(KP841253532), Td, VFNMS(LDK(KP654860733), Tg, VFNMS(LDK(KP959492973), T7, VFNMS(LDK(KP142314838), T4, T1)))));
Chris@82 246 ST(&(xo[WS(os, 8)]), VSUB(Ts, Tt), ovs, &(xo[0]));
Chris@82 247 ST(&(xo[WS(os, 3)]), VADD(Ts, Tt), ovs, &(xo[WS(os, 1)]));
Chris@82 248 {
Chris@82 249 V Tr, Tq, Tp, To;
/* Outputs 10 and 1. */
Chris@82 250 Tr = VBYI(VFMA(LDK(KP540640817), Ti, VFMA(LDK(KP909631995), Tm, VFMA(LDK(KP989821441), Tj, VFMA(LDK(KP755749574), Tk, VMUL(LDK(KP281732556), Tl))))));
Chris@82 251 Tq = VFMA(LDK(KP841253532), T4, VFMA(LDK(KP415415013), T7, VFNMS(LDK(KP959492973), Tg, VFNMS(LDK(KP654860733), Td, VFNMS(LDK(KP142314838), Ta, T1)))));
Chris@82 252 ST(&(xo[WS(os, 10)]), VSUB(Tq, Tr), ovs, &(xo[0]));
Chris@82 253 ST(&(xo[WS(os, 1)]), VADD(Tq, Tr), ovs, &(xo[WS(os, 1)]));
/* Outputs 9 and 2. */
Chris@82 254 Tp = VBYI(VFMA(LDK(KP909631995), Ti, VFNMS(LDK(KP540640817), Tl, VFNMS(LDK(KP989821441), Tk, VFNMS(LDK(KP281732556), Tj, VMUL(LDK(KP755749574), Tm))))));
Chris@82 255 To = VFMA(LDK(KP415415013), T4, VFMA(LDK(KP841253532), Tg, VFNMS(LDK(KP142314838), Td, VFNMS(LDK(KP959492973), Ta, VFNMS(LDK(KP654860733), T7, T1)))));
Chris@82 256 ST(&(xo[WS(os, 9)]), VSUB(To, Tp), ovs, &(xo[WS(os, 1)]));
Chris@82 257 ST(&(xo[WS(os, 2)]), VADD(To, Tp), ovs, &(xo[0]));
Chris@82 258 }
Chris@82 259 }
Chris@82 260 }
Chris@82 261 VLEAVE();
Chris@82 262 }
Chris@82 263
/*
 * Descriptor handed to the planner for the non-FMA variant: size 11, the
 * name string "n1fv_11", and the operation counts {30, 10, 40, 0}, whose
 * first three entries match the additions / multiplications / fused
 * multiply-adds reported in the generator's header comment for this variant.
 * NOTE(review): the meaning of the fourth count, of GENUS and of the four
 * trailing zeros is defined outside this file - confirm against kdft_desc.
 */
Chris@82 264 static const kdft_desc desc = { 11, XSIMD_STRING("n1fv_11"), {30, 10, 40, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 265
/*
 * Registration entry point (non-FMA variant): passes the n1fv_11 kernel and
 * its descriptor `desc` to X(kdft_register) for planner `p`.
 */
Chris@82 266 void XSIMD(codelet_n1fv_11) (planner *p) {
Chris@82 267 X(kdft_register) (p, n1fv_11, &desc);
Chris@82 268 }
Chris@82 269
Chris@82 270 #endif