annotate src/fftw-3.3.8/dft/simd/common/n2fv_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:05:07 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name n2fv_10 -with-ostride 2 -include dft/simd/n2f.h -store-multiple 2 */
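/*
 * Two builds of this codelet follow: this FMA-oriented variant (generated
 * with -fma, as the line above records) is compiled when the target
 * architecture or SIMD ISA extension prefers fused multiply-add; the plain
 * add/multiply variant after the #else below is used otherwise.
 */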

/*
 * This function contains 42 FP additions, 22 FP multiplications,
 * (or, 24 additions, 4 multiplications, 18 fused multiply/add),
 * 36 stack variables, 4 constants, and 25 memory accesses
 */
#include "dft/simd/n2f.h"

static void n2fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
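     /*
      * These are the standard coefficients of the length-5 sub-transforms,
      * arranged so that the sin(2*pi/5) factor can be pulled out of the FMAs:
      *   KP951056516 = sin(2*pi/5)
      *   KP618033988 = sin(pi/5)/sin(2*pi/5) = (sqrt(5)-1)/2
      *   KP559016994 = sqrt(5)/4
      *   KP250000000 = 1/4
      */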
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
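               /* Each pass of this loop handles VL of the v independent
                  transforms, advancing the input pointer by VL * ivs and the
                  output pointer by VL * ovs. */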
               V T3, Tr, Tm, Tn, TD, TC, Tu, Tx, Ty, Ta, Th, Ti, T1, T2;
               T1 = LD(&(xi[0]), ivs, &(xi[0]));
               T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
               T3 = VSUB(T1, T2);
               Tr = VADD(T1, T2);
               {
                    V T6, Ts, Tg, Tw, T9, Tt, Td, Tv;
                    {
                         V T4, T5, Te, Tf;
                         T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         T5 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         T6 = VSUB(T4, T5);
                         Ts = VADD(T4, T5);
                         Te = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                         Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Tg = VSUB(Te, Tf);
                         Tw = VADD(Te, Tf);
                    }
                    {
                         V T7, T8, Tb, Tc;
                         T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                         T9 = VSUB(T7, T8);
                         Tt = VADD(T7, T8);
                         Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         Tc = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                         Td = VSUB(Tb, Tc);
                         Tv = VADD(Tb, Tc);
                    }
                    Tm = VSUB(T6, T9);
                    Tn = VSUB(Td, Tg);
                    TD = VSUB(Ts, Tt);
                    TC = VSUB(Tv, Tw);
                    Tu = VADD(Ts, Tt);
                    Tx = VADD(Tv, Tw);
                    Ty = VADD(Tu, Tx);
                    Ta = VADD(T6, T9);
                    Th = VADD(Td, Tg);
                    Ti = VADD(Ta, Th);
               }
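               /* The ten loads have now been folded into five sums
                  x[n] + x[n+5] (Tr, Ts, Tw, Tt, Tv) and five differences
                  x[n] - x[n+5] (T3, T6, Tg, T9, Td): the sums produce the
                  even-indexed outputs and the differences the odd-indexed
                  ones, i.e. the length-10 DFT is split into two length-5
                  sub-transforms (a prime-factor-style split of 10 = 2 * 5,
                  which needs no twiddle multiplications, so only
                  adds/subtracts appear above). */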
               {
                    V TH, TI, TK, TL, TM;
                    TH = VADD(T3, Ti);
                    STM2(&(xo[10]), TH, ovs, &(xo[2]));
                    TI = VADD(Tr, Ty);
                    STM2(&(xo[0]), TI, ovs, &(xo[0]));
                    {
                         V To, Tq, Tl, Tp, Tj, Tk, TJ;
                         To = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tn, Tm));
                         Tq = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tm, Tn));
                         Tj = VFNMS(LDK(KP250000000), Ti, T3);
                         Tk = VSUB(Ta, Th);
                         Tl = VFMA(LDK(KP559016994), Tk, Tj);
                         Tp = VFNMS(LDK(KP559016994), Tk, Tj);
                         TJ = VFNMSI(To, Tl);
                         STM2(&(xo[2]), TJ, ovs, &(xo[2]));
                         STN2(&(xo[0]), TI, TJ, ovs);
                         TK = VFMAI(Tq, Tp);
                         STM2(&(xo[14]), TK, ovs, &(xo[2]));
                         TL = VFMAI(To, Tl);
                         STM2(&(xo[18]), TL, ovs, &(xo[2]));
                         TM = VFNMSI(Tq, Tp);
                         STM2(&(xo[6]), TM, ovs, &(xo[2]));
                    }
                    {
                         V TE, TG, TB, TF, Tz, TA;
                         TE = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TD, TC));
                         TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TC, TD));
                         Tz = VFNMS(LDK(KP250000000), Ty, Tr);
                         TA = VSUB(Tu, Tx);
                         TB = VFNMS(LDK(KP559016994), TA, Tz);
                         TF = VFMA(LDK(KP559016994), TA, Tz);
                         {
                              V TN, TO, TP, TQ;
                              TN = VFMAI(TE, TB);
                              STM2(&(xo[4]), TN, ovs, &(xo[0]));
                              STN2(&(xo[4]), TN, TM, ovs);
                              TO = VFNMSI(TG, TF);
                              STM2(&(xo[12]), TO, ovs, &(xo[0]));
                              STN2(&(xo[12]), TO, TK, ovs);
                              TP = VFNMSI(TE, TB);
                              STM2(&(xo[16]), TP, ovs, &(xo[0]));
                              STN2(&(xo[16]), TP, TL, ovs);
                              TQ = VFMAI(TG, TF);
                              STM2(&(xo[8]), TQ, ovs, &(xo[0]));
                              STN2(&(xo[8]), TQ, TH, ovs);
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 10, XSIMD_STRING("n2fv_10"), {24, 4, 18, 0}, &GENUS, 0, 2, 0, 0 };
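/*
 * Codelet descriptor: transform size (10), codelet name, the operation
 * counts quoted in the header comment above ({adds, mults, fmas, other}),
 * the codelet genus, and stride constraints -- the 2 presumably reflecting
 * the -with-ostride 2 option the codelet was generated with (see
 * dft/codelet-dft.h for the exact field layout).  The registration hook
 * below hands this descriptor to X(kdft_register) so that the planner can
 * consider n2fv_10 when building size-10 forward DFT plans.
 */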

void XSIMD(codelet_n2fv_10) (planner *p) {
     X(kdft_register) (p, n2fv_10, &desc);
}

#else

/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name n2fv_10 -with-ostride 2 -include dft/simd/n2f.h -store-multiple 2 */

/*
 * This function contains 42 FP additions, 12 FP multiplications,
 * (or, 36 additions, 6 multiplications, 6 fused multiply/add),
 * 36 stack variables, 4 constants, and 25 memory accesses
 */
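/*
 * Non-FMA variant: the same algorithm and the same 25 memory accesses as
 * above, but expressed with separate additions and multiplications
 * (36 + 6) instead of fused multiply-adds.
 */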
#include "dft/simd/n2f.h"

static void n2fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
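     /* The same length-5 constants as in the FMA variant, except that
        sin(pi/5) (KP587785252) is kept explicitly rather than folded into
        the ratio sin(pi/5)/sin(2*pi/5). */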
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) {
               V Ti, Ty, Tm, Tn, Tw, Tt, Tz, TA, TB, T7, Te, Tj, Tg, Th;
               Tg = LD(&(xi[0]), ivs, &(xi[0]));
               Th = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
               Ti = VSUB(Tg, Th);
               Ty = VADD(Tg, Th);
               {
                    V T3, Tu, Td, Ts, T6, Tv, Ta, Tr;
                    {
                         V T1, T2, Tb, Tc;
                         T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         T3 = VSUB(T1, T2);
                         Tu = VADD(T1, T2);
                         Tb = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                         Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Td = VSUB(Tb, Tc);
                         Ts = VADD(Tb, Tc);
                    }
                    {
                         V T4, T5, T8, T9;
                         T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                         T6 = VSUB(T4, T5);
                         Tv = VADD(T4, T5);
                         T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                         Ta = VSUB(T8, T9);
                         Tr = VADD(T8, T9);
                    }
                    Tm = VSUB(T3, T6);
                    Tn = VSUB(Ta, Td);
                    Tw = VSUB(Tu, Tv);
                    Tt = VSUB(Tr, Ts);
                    Tz = VADD(Tu, Tv);
                    TA = VADD(Tr, Ts);
                    TB = VADD(Tz, TA);
                    T7 = VADD(T3, T6);
                    Te = VADD(Ta, Td);
                    Tj = VADD(T7, Te);
               }
               {
                    V TH, TI, TK, TL, TM;
                    TH = VADD(Ti, Tj);
                    STM2(&(xo[10]), TH, ovs, &(xo[2]));
                    TI = VADD(Ty, TB);
                    STM2(&(xo[0]), TI, ovs, &(xo[0]));
                    {
                         V To, Tq, Tl, Tp, Tf, Tk, TJ;
                         To = VBYI(VFMA(LDK(KP951056516), Tm, VMUL(LDK(KP587785252), Tn)));
                         Tq = VBYI(VFNMS(LDK(KP587785252), Tm, VMUL(LDK(KP951056516), Tn)));
                         Tf = VMUL(LDK(KP559016994), VSUB(T7, Te));
                         Tk = VFNMS(LDK(KP250000000), Tj, Ti);
                         Tl = VADD(Tf, Tk);
                         Tp = VSUB(Tk, Tf);
                         TJ = VSUB(Tl, To);
                         STM2(&(xo[2]), TJ, ovs, &(xo[2]));
                         STN2(&(xo[0]), TI, TJ, ovs);
                         TK = VADD(Tq, Tp);
                         STM2(&(xo[14]), TK, ovs, &(xo[2]));
                         TL = VADD(To, Tl);
                         STM2(&(xo[18]), TL, ovs, &(xo[2]));
                         TM = VSUB(Tp, Tq);
                         STM2(&(xo[6]), TM, ovs, &(xo[2]));
                    }
                    {
                         V Tx, TF, TE, TG, TC, TD;
                         Tx = VBYI(VFNMS(LDK(KP587785252), Tw, VMUL(LDK(KP951056516), Tt)));
                         TF = VBYI(VFMA(LDK(KP951056516), Tw, VMUL(LDK(KP587785252), Tt)));
                         TC = VFNMS(LDK(KP250000000), TB, Ty);
                         TD = VMUL(LDK(KP559016994), VSUB(Tz, TA));
                         TE = VSUB(TC, TD);
                         TG = VADD(TD, TC);
                         {
                              V TN, TO, TP, TQ;
                              TN = VADD(Tx, TE);
                              STM2(&(xo[4]), TN, ovs, &(xo[0]));
                              STN2(&(xo[4]), TN, TM, ovs);
                              TO = VSUB(TG, TF);
                              STM2(&(xo[12]), TO, ovs, &(xo[0]));
                              STN2(&(xo[12]), TO, TK, ovs);
                              TP = VSUB(TE, Tx);
                              STM2(&(xo[16]), TP, ovs, &(xo[0]));
                              STN2(&(xo[16]), TP, TL, ovs);
                              TQ = VADD(TF, TG);
                              STM2(&(xo[8]), TQ, ovs, &(xo[0]));
                              STN2(&(xo[8]), TQ, TH, ovs);
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 10, XSIMD_STRING("n2fv_10"), {36, 6, 6, 0}, &GENUS, 0, 2, 0, 0 };
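/* Same descriptor as in the FMA branch, with the non-FMA operation counts
   {36, 6, 6, 0}. */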

void XSIMD(codelet_n2fv_10) (planner *p) {
     X(kdft_register) (p, n2fv_10, &desc);
}

#endif