annotate src/fftw-3.3.5/dft/simd/common/n1fv_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:38:39 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name n1fv_10 -include n1f.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 42 FP additions, 22 FP multiplications,
Chris@42 32 * (or, 24 additions, 4 multiplications, 18 fused multiply/add),
Chris@42 33 * 43 stack variables, 4 constants, and 20 memory accesses
Chris@42 34 */
Chris@42 35 #include "n1f.h"
Chris@42 36
/*
 * n1fv_10 (HAVE_FMA build): generated SIMD codelet for a 10-point DFT
 * (the generator command line recorded above this function uses "-n 10").
 *
 *   ri, ro   - base pointers; the loop reads through xi = ri and writes
 *              through xo = ro.
 *   ii, io   - not referenced anywhere in this body.
 *   is, os   - strides passed to WS() to address element k of input/output.
 *   v        - loop counter start; every iteration subtracts VL from it.
 *   ivs, ovs - xi / xo advance by VL*ivs / VL*ovs per iteration.
 *
 * Each iteration loads the ten inputs as five pairs (k, k+5 mod 10), forms
 * the pair sums and pair differences, and then builds the even-numbered
 * outputs from the sums and the odd-numbered outputs from the differences.
 *
 * NOTE(review): LD, ST, WS, VFMA, VFNMS, VFMAI and VFNMSI are defined
 * outside this file; the comments below presume VFMA(a,b,c) = a*b + c,
 * VFNMS(a,b,c) = c - a*b, VFMAI(b,c) = c + i*b and VFNMSI(b,c) = c - i*b.
 * Confirm against the SIMD support headers.
 */
Chris@42 37 static void n1fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
Chris@42 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */
Chris@42 40 DVK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@42 41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180); /* numerically (sqrt(5)-1)/2 */
Chris@42 42 DVK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5) */
Chris@42 43 {
Chris@42 44 INT i;
Chris@42 45 const R *xi;
Chris@42 46 R *xo;
Chris@42 47 xi = ri;
Chris@42 48 xo = ro;
Chris@42 49 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) { /* one pass per VL transforms */
Chris@42 50 V Tb, Tr, T3, Ts, T6, Tw, Tg, Tt, T9, Tc, T1, T2;
Chris@42 51 T1 = LD(&(xi[0]), ivs, &(xi[0])); /* input 0 */
Chris@42 52 T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); /* input 5 */
Chris@42 53 {
Chris@42 54 V T4, T5, Te, Tf, T7, T8;
Chris@42 55 T4 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); /* input 2 */
Chris@42 56 T5 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); /* input 7 */
Chris@42 57 Te = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); /* input 6 */
Chris@42 58 Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); /* input 1 */
Chris@42 59 T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); /* input 8 */
Chris@42 60 T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); /* input 3 */
Chris@42 61 Tb = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); /* input 4 */
Chris@42 62 Tr = VADD(T1, T2); /* x0 + x5 */
Chris@42 63 T3 = VSUB(T1, T2); /* x0 - x5 */
Chris@42 64 Ts = VADD(T4, T5); /* x2 + x7 */
Chris@42 65 T6 = VSUB(T4, T5); /* x2 - x7 */
Chris@42 66 Tw = VADD(Te, Tf); /* x6 + x1 */
Chris@42 67 Tg = VSUB(Te, Tf); /* x6 - x1 */
Chris@42 68 Tt = VADD(T7, T8); /* x8 + x3 */
Chris@42 69 T9 = VSUB(T7, T8); /* x8 - x3 */
Chris@42 70 Tc = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); /* input 9 */
Chris@42 71 }
Chris@42 72 {
Chris@42 73 V TD, Tu, Tm, Ta, Td, Tv;
Chris@42 74 TD = VSUB(Ts, Tt);
Chris@42 75 Tu = VADD(Ts, Tt);
Chris@42 76 Tm = VSUB(T6, T9);
Chris@42 77 Ta = VADD(T6, T9);
Chris@42 78 Td = VSUB(Tb, Tc); /* x4 - x9 */
Chris@42 79 Tv = VADD(Tb, Tc); /* x4 + x9 */
Chris@42 80 {
Chris@42 81 V TC, Tx, Tn, Th;
Chris@42 82 TC = VSUB(Tv, Tw);
Chris@42 83 Tx = VADD(Tv, Tw);
Chris@42 84 Tn = VSUB(Td, Tg);
Chris@42 85 Th = VADD(Td, Tg);
Chris@42 86 {
Chris@42 87 V Ty, TA, TE, TG, Ti, Tk, To, Tq, Tz, Tj;
Chris@42 88 Ty = VADD(Tu, Tx); /* total of the four pair sums other than Tr */
Chris@42 89 TA = VSUB(Tu, Tx);
Chris@42 90 TE = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TD, TC)); /* term later combined with TB for outputs 2 and 8 */
Chris@42 91 TG = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TC, TD)); /* term later combined with TF for outputs 4 and 6 */
Chris@42 92 Ti = VADD(Ta, Th); /* total of the four pair differences other than T3 */
Chris@42 93 Tk = VSUB(Ta, Th);
Chris@42 94 To = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tn, Tm)); /* term later combined with Tl for outputs 1 and 9 */
Chris@42 95 Tq = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tm, Tn)); /* term later combined with Tp for outputs 3 and 7 */
Chris@42 96 Tz = VFNMS(LDK(KP250000000), Ty, Tr); /* presumably Tr - Ty/4 */
Chris@42 97 ST(&(xo[0]), VADD(Tr, Ty), ovs, &(xo[0])); /* output 0: sum of all ten inputs */
Chris@42 98 Tj = VFNMS(LDK(KP250000000), Ti, T3); /* presumably T3 - Ti/4 */
Chris@42 99 ST(&(xo[WS(os, 5)]), VADD(T3, Ti), ovs, &(xo[WS(os, 1)])); /* output 5: x0 - x1 + x2 - ... - x9 */
Chris@42 100 {
Chris@42 101 V TB, TF, Tl, Tp;
Chris@42 102 TB = VFNMS(LDK(KP559016994), TA, Tz);
Chris@42 103 TF = VFMA(LDK(KP559016994), TA, Tz);
Chris@42 104 Tl = VFMA(LDK(KP559016994), Tk, Tj);
Chris@42 105 Tp = VFNMS(LDK(KP559016994), Tk, Tj);
Chris@42 106 ST(&(xo[WS(os, 4)]), VFMAI(TG, TF), ovs, &(xo[0])); /* outputs 4/6 share TF, differ in the sign of the TG term */
Chris@42 107 ST(&(xo[WS(os, 6)]), VFNMSI(TG, TF), ovs, &(xo[0]));
Chris@42 108 ST(&(xo[WS(os, 8)]), VFNMSI(TE, TB), ovs, &(xo[0])); /* outputs 8/2 share TB, differ in the sign of the TE term */
Chris@42 109 ST(&(xo[WS(os, 2)]), VFMAI(TE, TB), ovs, &(xo[0]));
Chris@42 110 ST(&(xo[WS(os, 3)]), VFNMSI(Tq, Tp), ovs, &(xo[WS(os, 1)])); /* outputs 3/7 share Tp, differ in the sign of the Tq term */
Chris@42 111 ST(&(xo[WS(os, 7)]), VFMAI(Tq, Tp), ovs, &(xo[WS(os, 1)]));
Chris@42 112 ST(&(xo[WS(os, 9)]), VFMAI(To, Tl), ovs, &(xo[WS(os, 1)])); /* outputs 9/1 share Tl, differ in the sign of the To term */
Chris@42 113 ST(&(xo[WS(os, 1)]), VFNMSI(To, Tl), ovs, &(xo[WS(os, 1)]));
Chris@42 114 }
Chris@42 115 }
Chris@42 116 }
Chris@42 117 }
Chris@42 118 }
Chris@42 119 }
Chris@42 120 VLEAVE();
Chris@42 121 }
Chris@42 122
/*
 * Descriptor handed to X(kdft_register) together with n1fv_10 (FMA build).
 * The leading 10 agrees with the generator's "-n 10", and {24, 4, 18, 0}
 * agrees with the "24 additions, 4 multiplications, 18 fused multiply/add"
 * tally in this variant's header comment.
 * NOTE(review): the meaning of the trailing zero fields is not visible
 * here; check the kdft_desc declaration.
 */
Chris@42 123 static const kdft_desc desc = { 10, XSIMD_STRING("n1fv_10"), {24, 4, 18, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 124
/*
 * Registration entry point (FMA build): passes the n1fv_10 kernel and its
 * descriptor to X(kdft_register) for planner p.
 */
Chris@42 125 void XSIMD(codelet_n1fv_10) (planner *p) {
Chris@42 126 X(kdft_register) (p, n1fv_10, &desc);
Chris@42 127 }
Chris@42 128
Chris@42 129 #else /* HAVE_FMA */
Chris@42 130
Chris@42 131 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 10 -name n1fv_10 -include n1f.h */
Chris@42 132
Chris@42 133 /*
Chris@42 134 * This function contains 42 FP additions, 12 FP multiplications,
Chris@42 135 * (or, 36 additions, 6 multiplications, 6 fused multiply/add),
Chris@42 136 * 33 stack variables, 4 constants, and 20 memory accesses
Chris@42 137 */
Chris@42 138 #include "n1f.h"
Chris@42 139
/*
 * n1fv_10 (build without HAVE_FMA): generated SIMD codelet for a 10-point
 * DFT (the generator command line recorded above this function uses
 * "-n 10").
 *
 *   ri, ro   - base pointers; the loop reads through xi = ri and writes
 *              through xo = ro.
 *   ii, io   - not referenced anywhere in this body.
 *   is, os   - strides passed to WS() to address element k of input/output.
 *   v        - loop counter start; every iteration subtracts VL from it.
 *   ivs, ovs - xi / xo advance by VL*ivs / VL*ovs per iteration.
 *
 * Each iteration loads the ten inputs as five pairs (k, k+5 mod 10), forms
 * the pair sums and pair differences, then writes the odd-numbered outputs
 * from the differences and the even-numbered outputs from the sums.
 *
 * NOTE(review): LD, ST, WS, VFMA, VFNMS and VBYI are defined outside this
 * file; the comments below presume VFMA(a,b,c) = a*b + c,
 * VFNMS(a,b,c) = c - a*b and VBYI(x) = i*x. Confirm against the SIMD
 * support headers.
 */
Chris@42 140 static void n1fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 141 {
Chris@42 142 DVK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@42 143 DVK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */
Chris@42 144 DVK(KP587785252, +0.587785252292473129168705954639072768597652438); /* numerically sin(pi/5) */
Chris@42 145 DVK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5) */
Chris@42 146 {
Chris@42 147 INT i;
Chris@42 148 const R *xi;
Chris@42 149 R *xo;
Chris@42 150 xi = ri;
Chris@42 151 xo = ro;
Chris@42 152 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(20, is), MAKE_VOLATILE_STRIDE(20, os)) { /* one pass per VL transforms */
Chris@42 153 V Ti, Ty, Tm, Tn, Tw, Tt, Tz, TA, TB, T7, Te, Tj, Tg, Th;
Chris@42 154 Tg = LD(&(xi[0]), ivs, &(xi[0])); /* input 0 */
Chris@42 155 Th = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); /* input 5 */
Chris@42 156 Ti = VSUB(Tg, Th); /* x0 - x5 */
Chris@42 157 Ty = VADD(Tg, Th); /* x0 + x5 */
Chris@42 158 {
Chris@42 159 V T3, Tu, Td, Ts, T6, Tv, Ta, Tr;
Chris@42 160 {
Chris@42 161 V T1, T2, Tb, Tc;
Chris@42 162 T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); /* input 2 */
Chris@42 163 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); /* input 7 */
Chris@42 164 T3 = VSUB(T1, T2); /* x2 - x7 */
Chris@42 165 Tu = VADD(T1, T2); /* x2 + x7 */
Chris@42 166 Tb = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); /* input 6 */
Chris@42 167 Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); /* input 1 */
Chris@42 168 Td = VSUB(Tb, Tc); /* x6 - x1 */
Chris@42 169 Ts = VADD(Tb, Tc); /* x6 + x1 */
Chris@42 170 }
Chris@42 171 {
Chris@42 172 V T4, T5, T8, T9;
Chris@42 173 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); /* input 8 */
Chris@42 174 T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); /* input 3 */
Chris@42 175 T6 = VSUB(T4, T5); /* x8 - x3 */
Chris@42 176 Tv = VADD(T4, T5); /* x8 + x3 */
Chris@42 177 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); /* input 4 */
Chris@42 178 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); /* input 9 */
Chris@42 179 Ta = VSUB(T8, T9); /* x4 - x9 */
Chris@42 180 Tr = VADD(T8, T9); /* x4 + x9 */
Chris@42 181 }
Chris@42 182 Tm = VSUB(T3, T6);
Chris@42 183 Tn = VSUB(Ta, Td);
Chris@42 184 Tw = VSUB(Tu, Tv);
Chris@42 185 Tt = VSUB(Tr, Ts);
Chris@42 186 Tz = VADD(Tu, Tv);
Chris@42 187 TA = VADD(Tr, Ts);
Chris@42 188 TB = VADD(Tz, TA); /* total of the four pair sums other than Ty */
Chris@42 189 T7 = VADD(T3, T6);
Chris@42 190 Te = VADD(Ta, Td);
Chris@42 191 Tj = VADD(T7, Te); /* total of the four pair differences other than Ti */
Chris@42 192 }
Chris@42 193 ST(&(xo[WS(os, 5)]), VADD(Ti, Tj), ovs, &(xo[WS(os, 1)])); /* output 5: x0 - x1 + x2 - ... - x9 */
Chris@42 194 ST(&(xo[0]), VADD(Ty, TB), ovs, &(xo[0])); /* output 0: sum of all ten inputs */
Chris@42 195 {
Chris@42 196 V To, Tq, Tl, Tp, Tf, Tk;
Chris@42 197 To = VBYI(VFMA(LDK(KP951056516), Tm, VMUL(LDK(KP587785252), Tn)));
Chris@42 198 Tq = VBYI(VFNMS(LDK(KP587785252), Tm, VMUL(LDK(KP951056516), Tn)));
Chris@42 199 Tf = VMUL(LDK(KP559016994), VSUB(T7, Te));
Chris@42 200 Tk = VFNMS(LDK(KP250000000), Tj, Ti); /* presumably Ti - Tj/4 */
Chris@42 201 Tl = VADD(Tf, Tk);
Chris@42 202 Tp = VSUB(Tk, Tf);
Chris@42 203 ST(&(xo[WS(os, 1)]), VSUB(Tl, To), ovs, &(xo[WS(os, 1)])); /* outputs 1/9 share Tl, differ in the sign of To */
Chris@42 204 ST(&(xo[WS(os, 7)]), VADD(Tq, Tp), ovs, &(xo[WS(os, 1)])); /* outputs 7/3 share Tp, differ in the sign of Tq */
Chris@42 205 ST(&(xo[WS(os, 9)]), VADD(To, Tl), ovs, &(xo[WS(os, 1)]));
Chris@42 206 ST(&(xo[WS(os, 3)]), VSUB(Tp, Tq), ovs, &(xo[WS(os, 1)]));
Chris@42 207 }
Chris@42 208 {
Chris@42 209 V Tx, TF, TE, TG, TC, TD;
Chris@42 210 Tx = VBYI(VFNMS(LDK(KP587785252), Tw, VMUL(LDK(KP951056516), Tt)));
Chris@42 211 TF = VBYI(VFMA(LDK(KP951056516), Tw, VMUL(LDK(KP587785252), Tt)));
Chris@42 212 TC = VFNMS(LDK(KP250000000), TB, Ty); /* presumably Ty - TB/4 */
Chris@42 213 TD = VMUL(LDK(KP559016994), VSUB(Tz, TA));
Chris@42 214 TE = VSUB(TC, TD);
Chris@42 215 TG = VADD(TD, TC);
Chris@42 216 ST(&(xo[WS(os, 2)]), VADD(Tx, TE), ovs, &(xo[0])); /* outputs 2/8 share TE, differ in the sign of Tx */
Chris@42 217 ST(&(xo[WS(os, 6)]), VSUB(TG, TF), ovs, &(xo[0])); /* outputs 6/4 share TG, differ in the sign of TF */
Chris@42 218 ST(&(xo[WS(os, 8)]), VSUB(TE, Tx), ovs, &(xo[0]));
Chris@42 219 ST(&(xo[WS(os, 4)]), VADD(TF, TG), ovs, &(xo[0]));
Chris@42 220 }
Chris@42 221 }
Chris@42 222 }
Chris@42 223 VLEAVE();
Chris@42 224 }
Chris@42 225
/*
 * Descriptor handed to X(kdft_register) together with n1fv_10 (non-FMA
 * build). The leading 10 agrees with the generator's "-n 10", and
 * {36, 6, 6, 0} agrees with the "36 additions, 6 multiplications, 6 fused
 * multiply/add" tally in this variant's header comment.
 * NOTE(review): the meaning of the trailing zero fields is not visible
 * here; check the kdft_desc declaration.
 */
Chris@42 226 static const kdft_desc desc = { 10, XSIMD_STRING("n1fv_10"), {36, 6, 6, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 227
/*
 * Registration entry point (non-FMA build): passes the n1fv_10 kernel and
 * its descriptor to X(kdft_register) for planner p.
 */
Chris@42 228 void XSIMD(codelet_n1fv_10) (planner *p) {
Chris@42 229 X(kdft_register) (p, n1fv_10, &desc);
Chris@42 230 }
Chris@42 231
Chris@42 232 #endif /* HAVE_FMA */