annotate src/fftw-3.3.5/dft/simd/common/n1fv_11.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:38:40 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 11 -name n1fv_11 -include n1f.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 70 FP additions, 60 FP multiplications,
Chris@42 32 * (or, 15 additions, 5 multiplications, 55 fused multiply/add),
Chris@42 33 * 67 stack variables, 11 constants, and 22 memory accesses
Chris@42 34 */
Chris@42 35 #include "n1f.h"
Chris@42 36
/*
 * n1fv_11, HAVE_FMA build: generated kernel for n = 11 (see the
 * "Generated by" line above for the exact genfft options).
 *
 * Parameters, as used by the visible body:
 *   ri       - input base pointer; read through xi at xi[WS(is, 0..10)].
 *   ii       - not referenced directly in this body.
 *   ro       - output base pointer; written through xo at xo[WS(os, 0..10)].
 *   io       - not referenced directly in this body.
 *   is, os   - strides handed to WS() to locate the 11 inputs / 11 outputs.
 *   v        - loop count; the loop runs while i > 0 and subtracts VL per pass.
 *   ivs, ovs - xi advances by VL * ivs and xo by VL * ovs after each pass.
 *
 * NOTE(review): V, R, INT, stride, VL, WS, LD, ST, DVK, LDK, VADD, VSUB,
 * VMUL, VFMA, VFNMS, VFMAI, VFNMSI, MAKE_VOLATILE_STRIDE and VLEAVE are
 * defined in headers not shown here; the comments below describe only how
 * this function uses them.
 */
Chris@42 37 static void n1fv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Eleven constants declared with DVK; each is read later through LDK. */
Chris@42 39 DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
Chris@42 40 DVK(KP876768831, +0.876768831002589333891339807079336796764054852);
Chris@42 41 DVK(KP918985947, +0.918985947228994779780736114132655398124909697);
Chris@42 42 DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
Chris@42 43 DVK(KP778434453, +0.778434453334651800608337670740821884709317477);
Chris@42 44 DVK(KP830830026, +0.830830026003772851058548298459246407048009821);
Chris@42 45 DVK(KP372785597, +0.372785597771792209609773152906148328659002598);
Chris@42 46 DVK(KP634356270, +0.634356270682424498893150776899916060542806975);
Chris@42 47 DVK(KP715370323, +0.715370323453429719112414662767260662417897278);
Chris@42 48 DVK(KP342584725, +0.342584725681637509502641509861112333758894680);
Chris@42 49 DVK(KP521108558, +0.521108558113202722944698153526659300680427422);
Chris@42 50 {
Chris@42 51 INT i;
Chris@42 52 const R *xi;
Chris@42 53 R *xo;
/* Only ri and ro are used as the moving input/output cursors. */
Chris@42 54 xi = ri;
Chris@42 55 xo = ro;
/*
 * Each pass loads the 11 inputs xi[WS(is, 0..10)] and stores the 11 outputs
 * xo[WS(os, 0..10)]. In every LD/ST the third argument is &(x[WS(s, 1)]) for
 * odd indices and &(x[0]) for even indices.
 * NOTE(review): presumably an alignment hint for the LD/ST macros; the
 * purpose of MAKE_VOLATILE_STRIDE(22, ...) is likewise not visible here.
 */
Chris@42 56 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
Chris@42 57 V T1, Tb, T4, Tp, Tg, Tq, T7, Tn, Ta, Tm, Tc, Tr;
Chris@42 58 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 59 {
Chris@42 60 V T2, T3, Te, Tf;
Chris@42 61 T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 62 T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 63 Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 64 Tf = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 65 {
Chris@42 66 V T5, T6, T8, T9;
Chris@42 67 T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 68 T6 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 69 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 70 T9 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 71 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
/*
 * Sum and difference of each input pair whose indices add up to 11:
 * (1,10) -> T4/Tp, (5,6) -> Tg/Tq, (2,9) -> T7/Tn, (3,8) -> Ta/Tm.
 * The (4,7) pair is combined further down as Td/To.
 */
Chris@42 72 T4 = VADD(T2, T3);
Chris@42 73 Tp = VSUB(T3, T2);
Chris@42 74 Tg = VADD(Te, Tf);
Chris@42 75 Tq = VSUB(Tf, Te);
Chris@42 76 T7 = VADD(T5, T6);
Chris@42 77 Tn = VSUB(T6, T5);
Chris@42 78 Ta = VADD(T8, T9);
Chris@42 79 Tm = VSUB(T9, T8);
Chris@42 80 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 81 }
Chris@42 82 }
Chris@42 83 Tr = VFMA(LDK(KP521108558), Tq, Tp);
Chris@42 84 {
Chris@42 85 V TS, TE, Th, Td, To, T12, TO, TB, T11, TN, TA, TF;
Chris@42 86 T11 = VFNMS(LDK(KP521108558), Tp, Tn);
Chris@42 87 TN = VFNMS(LDK(KP342584725), T7, Tg);
Chris@42 88 TA = VFMA(LDK(KP521108558), Tm, Tq);
Chris@42 89 TS = VFMA(LDK(KP715370323), Tm, Tp);
Chris@42 90 TE = VFNMS(LDK(KP342584725), T4, Ta);
Chris@42 91 Th = VFNMS(LDK(KP342584725), Ta, T7);
/* Sum and difference of the remaining input pair (4,7). */
Chris@42 92 Td = VADD(Tb, Tc);
Chris@42 93 To = VSUB(Tc, Tb);
Chris@42 94 T12 = VFNMS(LDK(KP715370323), T11, Tm);
Chris@42 95 TO = VFNMS(LDK(KP634356270), TN, T4);
Chris@42 96 TB = VFNMS(LDK(KP715370323), TA, Tn);
Chris@42 97 TF = VFNMS(LDK(KP634356270), TE, Tg);
Chris@42 98 {
Chris@42 99 V T14, TD, TV, Tu, TY, Tx, Tk, TR, TI, TM, TJ, TT, Ts;
Chris@42 100 TJ = VFNMS(LDK(KP521108558), Tn, To);
Chris@42 101 TT = VFMA(LDK(KP372785597), To, TS);
Chris@42 102 Ts = VFMA(LDK(KP715370323), Tr, To);
/* Output 0 is T1 plus the five pair sums T4, T7, Ta, Td and Tg. */
Chris@42 103 ST(&(xo[0]), VADD(T1, VADD(T4, VADD(T7, VADD(Ta, VADD(Td, Tg))))), ovs, &(xo[0]));
Chris@42 104 {
Chris@42 105 V TW, Tv, Ti, T13;
Chris@42 106 TW = VFNMS(LDK(KP342584725), Tg, Td);
Chris@42 107 Tv = VFNMS(LDK(KP342584725), Td, T4);
Chris@42 108 Ti = VFNMS(LDK(KP634356270), Th, Td);
Chris@42 109 T13 = VFNMS(LDK(KP830830026), T12, To);
Chris@42 110 {
Chris@42 111 V TP, TC, TG, TK;
Chris@42 112 TP = VFNMS(LDK(KP778434453), TO, Ta);
Chris@42 113 TC = VFMA(LDK(KP830830026), TB, Tp);
Chris@42 114 TG = VFNMS(LDK(KP778434453), TF, Td);
Chris@42 115 TK = VFMA(LDK(KP715370323), TJ, Tq);
Chris@42 116 {
Chris@42 117 V TU, Tt, TX, Tw;
Chris@42 118 TU = VFNMS(LDK(KP830830026), TT, Tq);
Chris@42 119 Tt = VFMA(LDK(KP830830026), Ts, Tn);
Chris@42 120 TX = VFNMS(LDK(KP634356270), TW, Ta);
Chris@42 121 Tw = VFNMS(LDK(KP634356270), Tv, T7);
Chris@42 122 {
Chris@42 123 V Tj, TQ, TH, TL;
Chris@42 124 Tj = VFNMS(LDK(KP778434453), Ti, T4);
/*
 * T14, TD, TV, Tu and TM are the chains built from the pair differences
 * (Tm, Tn, To, Tp, Tq), each scaled by KP989821441. TR, TI (and later Tl,
 * T10, Tz) are the chains built from T1 and the pair sums.
 */
Chris@42 125 T14 = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), T13, Tq));
Chris@42 126 TQ = VFNMS(LDK(KP876768831), TP, Td);
Chris@42 127 TD = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TC, To));
Chris@42 128 TH = VFNMS(LDK(KP876768831), TG, T7);
Chris@42 129 TL = VFNMS(LDK(KP830830026), TK, Tm);
Chris@42 130 TV = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), TU, Tn));
Chris@42 131 Tu = VMUL(LDK(KP989821441), VFMA(LDK(KP918985947), Tt, Tm));
Chris@42 132 TY = VFNMS(LDK(KP778434453), TX, T7);
Chris@42 133 Tx = VFNMS(LDK(KP778434453), Tw, Tg);
Chris@42 134 Tk = VFNMS(LDK(KP876768831), Tj, Tg);
Chris@42 135 TR = VFNMS(LDK(KP959492973), TQ, T1);
Chris@42 136 TI = VFNMS(LDK(KP959492973), TH, T1);
Chris@42 137 TM = VMUL(LDK(KP989821441), VFNMS(LDK(KP918985947), TL, Tp));
Chris@42 138 }
Chris@42 139 }
Chris@42 140 }
Chris@42 141 }
Chris@42 142 {
Chris@42 143 V TZ, Ty, Tl, T10, Tz;
Chris@42 144 TZ = VFNMS(LDK(KP876768831), TY, T4);
Chris@42 145 Ty = VFNMS(LDK(KP876768831), Tx, Ta);
Chris@42 146 Tl = VFNMS(LDK(KP959492973), Tk, T1);
/*
 * Outputs 1..10 are written in pairs whose indices add up to 11
 * (7/4, 3/8, 1/10, 5/6, 9/2). Each pair applies VFMAI and VFNMSI to the
 * same two operands.
 * NOTE(review): presumably VFMAI/VFNMSI form b + i*a and b - i*a; confirm
 * against the SIMD header.
 */
Chris@42 147 ST(&(xo[WS(os, 7)]), VFMAI(TV, TR), ovs, &(xo[WS(os, 1)]));
Chris@42 148 ST(&(xo[WS(os, 4)]), VFNMSI(TV, TR), ovs, &(xo[0]));
Chris@42 149 ST(&(xo[WS(os, 3)]), VFMAI(TM, TI), ovs, &(xo[WS(os, 1)]));
Chris@42 150 ST(&(xo[WS(os, 8)]), VFNMSI(TM, TI), ovs, &(xo[0]));
Chris@42 151 T10 = VFNMS(LDK(KP959492973), TZ, T1);
Chris@42 152 Tz = VFNMS(LDK(KP959492973), Ty, T1);
Chris@42 153 ST(&(xo[WS(os, 1)]), VFMAI(Tu, Tl), ovs, &(xo[WS(os, 1)]));
Chris@42 154 ST(&(xo[WS(os, 10)]), VFNMSI(Tu, Tl), ovs, &(xo[0]));
Chris@42 155 ST(&(xo[WS(os, 5)]), VFMAI(T14, T10), ovs, &(xo[WS(os, 1)]));
Chris@42 156 ST(&(xo[WS(os, 6)]), VFNMSI(T14, T10), ovs, &(xo[0]));
Chris@42 157 ST(&(xo[WS(os, 9)]), VFMAI(TD, Tz), ovs, &(xo[WS(os, 1)]));
Chris@42 158 ST(&(xo[WS(os, 2)]), VFNMSI(TD, Tz), ovs, &(xo[0]));
Chris@42 159 }
Chris@42 160 }
Chris@42 161 }
Chris@42 162 }
Chris@42 163 }
/* Invoked once after the loop; NOTE(review): VLEAVE is defined outside this file. */
Chris@42 164 VLEAVE();
Chris@42 165 }
Chris@42 166
/*
 * Descriptor handed to X(kdft_register) together with n1fv_11 (HAVE_FMA build).
 * The leading 11 and the name string match the "-n 11 -name n1fv_11" generator
 * options; {15, 5, 55, 0} repeats the 15 additions / 5 multiplications /
 * 55 fused multiply/add figures from the operation-count comment of this variant.
 * NOTE(review): field meanings are defined by kdft_desc, which is not shown
 * here; confirm what the fourth count and the trailing zeros represent.
 */
Chris@42 167 static const kdft_desc desc = { 11, XSIMD_STRING("n1fv_11"), {15, 5, 55, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 168
/*
 * Registration entry point for the HAVE_FMA build: passes the static kernel
 * n1fv_11 and its descriptor to X(kdft_register) for planner p.
 */
Chris@42 169 void XSIMD(codelet_n1fv_11) (planner *p) {
Chris@42 170 X(kdft_register) (p, n1fv_11, &desc);
Chris@42 171 }
Chris@42 172
Chris@42 173 #else /* HAVE_FMA */
Chris@42 174
Chris@42 175 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 11 -name n1fv_11 -include n1f.h */
Chris@42 176
Chris@42 177 /*
Chris@42 178 * This function contains 70 FP additions, 50 FP multiplications,
Chris@42 179 * (or, 30 additions, 10 multiplications, 40 fused multiply/add),
Chris@42 180 * 32 stack variables, 10 constants, and 22 memory accesses
Chris@42 181 */
Chris@42 182 #include "n1f.h"
Chris@42 183
/*
 * n1fv_11, build without HAVE_FMA: generated kernel for n = 11 (see the
 * "Generated by" line above for the exact genfft options).
 *
 * Parameters, as used by the visible body:
 *   ri       - input base pointer; read through xi at xi[WS(is, 0..10)].
 *   ii       - not referenced directly in this body.
 *   ro       - output base pointer; written through xo at xo[WS(os, 0..10)].
 *   io       - not referenced directly in this body.
 *   is, os   - strides handed to WS() to locate the 11 inputs / 11 outputs.
 *   v        - loop count; the loop runs while i > 0 and subtracts VL per pass.
 *   ivs, ovs - xi advances by VL * ivs and xo by VL * ovs after each pass.
 *
 * NOTE(review): V, R, INT, stride, VL, WS, LD, ST, DVK, LDK, VADD, VSUB,
 * VMUL, VFMA, VFNMS, VBYI, MAKE_VOLATILE_STRIDE and VLEAVE are defined in
 * headers not shown here; the comments below describe only how this function
 * uses them.
 */
Chris@42 184 static void n1fv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 185 {
/*
 * Ten constants declared with DVK; each is read later through LDK.
 * NOTE(review): numerically, 0.8412.., 0.4154.., 0.1423.., 0.6548.. and
 * 0.9594.. agree with |cos(2*pi*k/11)| and 0.5406.., 0.9096.., 0.9898..,
 * 0.7557.. and 0.2817.. with sin(2*pi*k/11) for k = 1..5; confirm before
 * relying on this reading.
 */
Chris@42 186 DVK(KP654860733, +0.654860733945285064056925072466293553183791199);
Chris@42 187 DVK(KP142314838, +0.142314838273285140443792668616369668791051361);
Chris@42 188 DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
Chris@42 189 DVK(KP415415013, +0.415415013001886425529274149229623203524004910);
Chris@42 190 DVK(KP841253532, +0.841253532831181168861811648919367717513292498);
Chris@42 191 DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
Chris@42 192 DVK(KP909631995, +0.909631995354518371411715383079028460060241051);
Chris@42 193 DVK(KP281732556, +0.281732556841429697711417915346616899035777899);
Chris@42 194 DVK(KP540640817, +0.540640817455597582107635954318691695431770608);
Chris@42 195 DVK(KP755749574, +0.755749574354258283774035843972344420179717445);
Chris@42 196 {
Chris@42 197 INT i;
Chris@42 198 const R *xi;
Chris@42 199 R *xo;
/* Only ri and ro are used as the moving input/output cursors. */
Chris@42 200 xi = ri;
Chris@42 201 xo = ro;
/*
 * Each pass loads the 11 inputs xi[WS(is, 0..10)] and stores the 11 outputs
 * xo[WS(os, 0..10)]. In every LD/ST the third argument is &(x[WS(s, 1)]) for
 * odd indices and &(x[0]) for even indices.
 * NOTE(review): presumably an alignment hint for the LD/ST macros; the
 * purpose of MAKE_VOLATILE_STRIDE(22, ...) is likewise not visible here.
 */
Chris@42 202 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(22, is), MAKE_VOLATILE_STRIDE(22, os)) {
Chris@42 203 V T1, T4, Ti, Tg, Tl, Td, Tk, Ta, Tj, T7, Tm, Tb, Tc, Tt, Ts;
Chris@42 204 T1 = LD(&(xi[0]), ivs, &(xi[0]));
/*
 * Inputs 1..10 are loaded in pairs whose indices add up to 11; each pair is
 * reduced to a sum and a difference:
 * (1,10) -> T4/Ti, (5,6) -> Tg/Tl, (4,7) -> Td/Tk, (3,8) -> Ta/Tj,
 * (2,9) -> T7/Tm.
 */
Chris@42 205 {
Chris@42 206 V T2, T3, Te, Tf;
Chris@42 207 T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 208 T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 209 T4 = VADD(T2, T3);
Chris@42 210 Ti = VSUB(T3, T2);
Chris@42 211 Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 212 Tf = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 213 Tg = VADD(Te, Tf);
Chris@42 214 Tl = VSUB(Tf, Te);
Chris@42 215 }
Chris@42 216 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 217 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 218 Td = VADD(Tb, Tc);
Chris@42 219 Tk = VSUB(Tc, Tb);
Chris@42 220 {
Chris@42 221 V T8, T9, T5, T6;
Chris@42 222 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 223 T9 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 224 Ta = VADD(T8, T9);
Chris@42 225 Tj = VSUB(T9, T8);
Chris@42 226 T5 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 227 T6 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 228 T7 = VADD(T5, T6);
Chris@42 229 Tm = VSUB(T6, T5);
Chris@42 230 }
/* Output 0 is T1 plus the five pair sums T4, T7, Ta, Td and Tg. */
Chris@42 231 ST(&(xo[0]), VADD(T1, VADD(T4, VADD(T7, VADD(Ta, VADD(Td, Tg))))), ovs, &(xo[0]));
/*
 * Outputs 1..10 are produced in pairs whose indices add up to 11
 * (7/4, 6/5, 8/3, 10/1, 9/2). For each pair, one term is VBYI applied to a
 * linear combination of the differences (Ti, Tj, Tk, Tl, Tm), and the other
 * is a linear combination of T1 and the sums (T4, T7, Ta, Td, Tg); the two
 * outputs are their VSUB and VADD.
 * NOTE(review): presumably VBYI multiplies by the imaginary unit; confirm
 * against the SIMD header.
 */
Chris@42 232 {
Chris@42 233 V Tn, Th, Tv, Tu;
Chris@42 234 Tn = VBYI(VFMA(LDK(KP755749574), Ti, VFMA(LDK(KP540640817), Tj, VFNMS(LDK(KP909631995), Tl, VFNMS(LDK(KP989821441), Tm, VMUL(LDK(KP281732556), Tk))))));
Chris@42 235 Th = VFMA(LDK(KP841253532), Ta, VFMA(LDK(KP415415013), Tg, VFNMS(LDK(KP959492973), Td, VFNMS(LDK(KP142314838), T7, VFNMS(LDK(KP654860733), T4, T1)))));
Chris@42 236 ST(&(xo[WS(os, 7)]), VSUB(Th, Tn), ovs, &(xo[WS(os, 1)]));
Chris@42 237 ST(&(xo[WS(os, 4)]), VADD(Th, Tn), ovs, &(xo[0]));
Chris@42 238 Tv = VBYI(VFMA(LDK(KP281732556), Ti, VFMA(LDK(KP755749574), Tj, VFNMS(LDK(KP909631995), Tk, VFNMS(LDK(KP540640817), Tm, VMUL(LDK(KP989821441), Tl))))));
Chris@42 239 Tu = VFMA(LDK(KP841253532), T7, VFMA(LDK(KP415415013), Td, VFNMS(LDK(KP142314838), Tg, VFNMS(LDK(KP654860733), Ta, VFNMS(LDK(KP959492973), T4, T1)))));
Chris@42 240 ST(&(xo[WS(os, 6)]), VSUB(Tu, Tv), ovs, &(xo[0]));
Chris@42 241 ST(&(xo[WS(os, 5)]), VADD(Tu, Tv), ovs, &(xo[WS(os, 1)]));
Chris@42 242 }
Chris@42 243 Tt = VBYI(VFMA(LDK(KP989821441), Ti, VFMA(LDK(KP540640817), Tk, VFNMS(LDK(KP909631995), Tj, VFNMS(LDK(KP281732556), Tm, VMUL(LDK(KP755749574), Tl))))));
Chris@42 244 Ts = VFMA(LDK(KP415415013), Ta, VFMA(LDK(KP841253532), Td, VFNMS(LDK(KP654860733), Tg, VFNMS(LDK(KP959492973), T7, VFNMS(LDK(KP142314838), T4, T1)))));
Chris@42 245 ST(&(xo[WS(os, 8)]), VSUB(Ts, Tt), ovs, &(xo[0]));
Chris@42 246 ST(&(xo[WS(os, 3)]), VADD(Ts, Tt), ovs, &(xo[WS(os, 1)]));
Chris@42 247 {
Chris@42 248 V Tr, Tq, Tp, To;
Chris@42 249 Tr = VBYI(VFMA(LDK(KP540640817), Ti, VFMA(LDK(KP909631995), Tm, VFMA(LDK(KP989821441), Tj, VFMA(LDK(KP755749574), Tk, VMUL(LDK(KP281732556), Tl))))));
Chris@42 250 Tq = VFMA(LDK(KP841253532), T4, VFMA(LDK(KP415415013), T7, VFNMS(LDK(KP959492973), Tg, VFNMS(LDK(KP654860733), Td, VFNMS(LDK(KP142314838), Ta, T1)))));
Chris@42 251 ST(&(xo[WS(os, 10)]), VSUB(Tq, Tr), ovs, &(xo[0]));
Chris@42 252 ST(&(xo[WS(os, 1)]), VADD(Tq, Tr), ovs, &(xo[WS(os, 1)]));
Chris@42 253 Tp = VBYI(VFMA(LDK(KP909631995), Ti, VFNMS(LDK(KP540640817), Tl, VFNMS(LDK(KP989821441), Tk, VFNMS(LDK(KP281732556), Tj, VMUL(LDK(KP755749574), Tm))))));
Chris@42 254 To = VFMA(LDK(KP415415013), T4, VFMA(LDK(KP841253532), Tg, VFNMS(LDK(KP142314838), Td, VFNMS(LDK(KP959492973), Ta, VFNMS(LDK(KP654860733), T7, T1)))));
Chris@42 255 ST(&(xo[WS(os, 9)]), VSUB(To, Tp), ovs, &(xo[WS(os, 1)]));
Chris@42 256 ST(&(xo[WS(os, 2)]), VADD(To, Tp), ovs, &(xo[0]));
Chris@42 257 }
Chris@42 258 }
Chris@42 259 }
/* Invoked once after the loop; NOTE(review): VLEAVE is defined outside this file. */
Chris@42 260 VLEAVE();
Chris@42 261 }
Chris@42 262
/*
 * Descriptor handed to X(kdft_register) together with n1fv_11 (build without
 * HAVE_FMA). The leading 11 and the name string match the
 * "-n 11 -name n1fv_11" generator options; {30, 10, 40, 0} repeats the
 * 30 additions / 10 multiplications / 40 fused multiply/add figures from the
 * operation-count comment of this variant.
 * NOTE(review): field meanings are defined by kdft_desc, which is not shown
 * here; confirm what the fourth count and the trailing zeros represent.
 */
Chris@42 263 static const kdft_desc desc = { 11, XSIMD_STRING("n1fv_11"), {30, 10, 40, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 264
/*
 * Registration entry point for the build without HAVE_FMA: passes the static
 * kernel n1fv_11 and its descriptor to X(kdft_register) for planner p.
 */
Chris@42 265 void XSIMD(codelet_n1fv_11) (planner *p) {
Chris@42 266 X(kdft_register) (p, n1fv_11, &desc);
Chris@42 267 }
Chris@42 268
Chris@42 269 #endif /* HAVE_FMA */