annotate src/fftw-3.3.5/dft/simd/common/n1fv_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:38:40 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n1fv_12 -include n1f.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 48 FP additions, 20 FP multiplications,
Chris@42 32 * (or, 30 additions, 2 multiplications, 18 fused multiply/add),
Chris@42 33 * 49 stack variables, 2 constants, and 24 memory accesses
Chris@42 34 */
Chris@42 35 #include "n1f.h"
Chris@42 36
/*
 * n1fv_12, HAVE_FMA variant: genfft-generated SIMD kernel of size 12 (see the
 * "Generated by" comment above: -n 12 -fma).
 *
 * ri, ro   : input / output base pointers; copied into xi / xo.
 * ii, io   : not referenced anywhere in this body.
 * is, os   : strides handed to WS() to address the 12 inputs / 12 outputs.
 * v        : initial value of the loop counter, which decreases by VL.
 * ivs, ovs : xi / xo advance by VL * ivs and VL * ovs per iteration; both are
 *            also forwarded to every LD() / ST() call.
 *
 * NOTE(review): the leading "Chris@42 <n>" on each line is the "rev" and
 * "line" column of the annotate listing, not part of the C source.
 */
Chris@42 37 static void n1fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Vector constants: 0.866025403... (numerically sqrt(3)/2) and 0.5. */
Chris@42 39 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 41 {
Chris@42 42 INT i;
Chris@42 43 const R *xi;
Chris@42 44 R *xo;
Chris@42 45 xi = ri;
Chris@42 46 xo = ro;
/* Loops while i > 0, decreasing i by VL and stepping xi / xo by VL * ivs and
   VL * ovs. MAKE_VOLATILE_STRIDE is evaluated in the increment expression;
   its definition is not part of this listing. */
Chris@42 47 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
Chris@42 48 V T1, T6, Tk, Tn, Tc, Td, Tf, Tr, T4, Ts, T9, Tg, Te, Tl;
Chris@42 49 {
Chris@42 50 V T2, T3, T7, T8;
/* Load the twelve inputs xi[WS(is, 0..11)]. Even indices pass &(xi[0]) and
   odd indices pass &(xi[WS(is, 1)]) as the third LD argument. */
Chris@42 51 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 52 T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 53 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 54 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 55 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 56 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 57 Tk = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 58 Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 59 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 60 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 61 Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
/* Sum and difference of inputs 4 and 8 (T4, Tr) and of inputs 10 and 2
   (T9, Ts). */
Chris@42 62 Tr = VSUB(T3, T2);
Chris@42 63 T4 = VADD(T2, T3);
Chris@42 64 Ts = VSUB(T8, T7);
Chris@42 65 T9 = VADD(T7, T8);
Chris@42 66 Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 67 }
/* Difference and sum of inputs 11 and 7. */
Chris@42 68 Te = VSUB(Tc, Td);
Chris@42 69 Tl = VADD(Td, Tc);
Chris@42 70 {
Chris@42 71 V T5, TF, TB, Tt, Ta, TG, Th, To, Tm, TI;
/* Four VFNMS operations use the 0.5 constant (T5, Ta, Tm here, Tp below);
   Th / To are the difference / sum of inputs 1 and 5. */
Chris@42 72 T5 = VFNMS(LDK(KP500000000), T4, T1);
Chris@42 73 TF = VADD(T1, T4);
Chris@42 74 TB = VADD(Tr, Ts);
Chris@42 75 Tt = VSUB(Tr, Ts);
Chris@42 76 Ta = VFNMS(LDK(KP500000000), T9, T6);
Chris@42 77 TG = VADD(T6, T9);
Chris@42 78 Th = VSUB(Tf, Tg);
Chris@42 79 To = VADD(Tf, Tg);
Chris@42 80 Tm = VFNMS(LDK(KP500000000), Tl, Tk);
Chris@42 81 TI = VADD(Tk, Tl);
Chris@42 82 {
Chris@42 83 V TH, TL, Tb, Tx, TJ, Tp, Ti, TA;
Chris@42 84 TH = VSUB(TF, TG);
Chris@42 85 TL = VADD(TF, TG);
Chris@42 86 Tb = VSUB(T5, Ta);
Chris@42 87 Tx = VADD(T5, Ta);
Chris@42 88 TJ = VADD(Tn, To);
Chris@42 89 Tp = VFNMS(LDK(KP500000000), To, Tn);
Chris@42 90 Ti = VADD(Te, Th);
Chris@42 91 TA = VSUB(Te, Th);
Chris@42 92 {
Chris@42 93 V Tq, Ty, TK, TM;
Chris@42 94 Tq = VSUB(Tm, Tp);
Chris@42 95 Ty = VADD(Tm, Tp);
Chris@42 96 TK = VSUB(TI, TJ);
Chris@42 97 TM = VADD(TI, TJ);
Chris@42 98 {
Chris@42 99 V TC, TE, Tj, Tv;
/* The 0.866... constant is applied here: the function's only two VMULs,
   then VFMA / VFNMS on (Ti, Tb) and, just below, on (Tt, Tq). */
Chris@42 100 TC = VMUL(LDK(KP866025403), VSUB(TA, TB));
Chris@42 101 TE = VMUL(LDK(KP866025403), VADD(TB, TA));
Chris@42 102 Tj = VFMA(LDK(KP866025403), Ti, Tb);
Chris@42 103 Tv = VFNMS(LDK(KP866025403), Ti, Tb);
Chris@42 104 {
Chris@42 105 V Tz, TD, Tu, Tw;
Chris@42 106 Tz = VSUB(Tx, Ty);
Chris@42 107 TD = VADD(Tx, Ty);
Chris@42 108 Tu = VFNMS(LDK(KP866025403), Tt, Tq);
Chris@42 109 Tw = VFMA(LDK(KP866025403), Tt, Tq);
/* Store the twelve outputs xo[WS(os, 0..11)]. They are written in pairs that
   share the same two operands: (0,6), (3,9), (4,8), (10,2), (5,7), (11,1).
   Even outputs pass &(xo[0]) and odd outputs pass &(xo[WS(os, 1)]) as the
   last ST argument. VFMAI / VFNMSI are macros defined elsewhere. */
Chris@42 110 ST(&(xo[0]), VADD(TL, TM), ovs, &(xo[0]));
Chris@42 111 ST(&(xo[WS(os, 6)]), VSUB(TL, TM), ovs, &(xo[0]));
Chris@42 112 ST(&(xo[WS(os, 3)]), VFMAI(TK, TH), ovs, &(xo[WS(os, 1)]));
Chris@42 113 ST(&(xo[WS(os, 9)]), VFNMSI(TK, TH), ovs, &(xo[WS(os, 1)]));
Chris@42 114 ST(&(xo[WS(os, 4)]), VFMAI(TE, TD), ovs, &(xo[0]));
Chris@42 115 ST(&(xo[WS(os, 8)]), VFNMSI(TE, TD), ovs, &(xo[0]));
Chris@42 116 ST(&(xo[WS(os, 10)]), VFNMSI(TC, Tz), ovs, &(xo[0]));
Chris@42 117 ST(&(xo[WS(os, 2)]), VFMAI(TC, Tz), ovs, &(xo[0]));
Chris@42 118 ST(&(xo[WS(os, 5)]), VFNMSI(Tw, Tv), ovs, &(xo[WS(os, 1)]));
Chris@42 119 ST(&(xo[WS(os, 7)]), VFMAI(Tw, Tv), ovs, &(xo[WS(os, 1)]));
Chris@42 120 ST(&(xo[WS(os, 11)]), VFMAI(Tu, Tj), ovs, &(xo[WS(os, 1)]));
Chris@42 121 ST(&(xo[WS(os, 1)]), VFNMSI(Tu, Tj), ovs, &(xo[WS(os, 1)]));
Chris@42 122 }
Chris@42 123 }
Chris@42 124 }
Chris@42 125 }
Chris@42 126 }
Chris@42 127 }
Chris@42 128 }
/* VLEAVE() is invoked once, after the loop; the macro is defined elsewhere. */
Chris@42 129 VLEAVE();
Chris@42 130 }
Chris@42 131
/* Descriptor handed to kdft_register together with n1fv_12 (HAVE_FMA variant):
   size 12, the name string "n1fv_12", and {30, 2, 18, 0}, whose first three
   entries match the additions / multiplications / fused multiply-add tally in
   this variant's header comment. The meaning of the fourth entry and of the
   trailing zero fields is not visible in this listing. */
Chris@42 132 static const kdft_desc desc = { 12, XSIMD_STRING("n1fv_12"), {30, 2, 18, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 133
/* Registration entry point (HAVE_FMA variant): passes the planner p, the
   n1fv_12 kernel and its descriptor desc to X(kdft_register). XSIMD() and X()
   are macros that are not defined in this listing. */
Chris@42 134 void XSIMD(codelet_n1fv_12) (planner *p) {
Chris@42 135 X(kdft_register) (p, n1fv_12, &desc);
Chris@42 136 }
Chris@42 137
Chris@42 138 #else /* HAVE_FMA */
Chris@42 139
Chris@42 140 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n1fv_12 -include n1f.h */
Chris@42 141
Chris@42 142 /*
Chris@42 143 * This function contains 48 FP additions, 8 FP multiplications,
Chris@42 144 * (or, 44 additions, 4 multiplications, 4 fused multiply/add),
Chris@42 145 * 27 stack variables, 2 constants, and 24 memory accesses
Chris@42 146 */
Chris@42 147 #include "n1f.h"
Chris@42 148
/*
 * n1fv_12, non-FMA variant (the #else branch of HAVE_FMA): genfft-generated
 * SIMD kernel of size 12 (see the "Generated by" comment above: -n 12).
 *
 * ri, ro   : input / output base pointers; copied into xi / xo.
 * ii, io   : not referenced anywhere in this body.
 * is, os   : strides handed to WS() to address the 12 inputs / 12 outputs.
 * v        : initial value of the loop counter, which decreases by VL.
 * ivs, ovs : xi / xo advance by VL * ivs and VL * ovs per iteration; both are
 *            also forwarded to every LD() / ST() call.
 *
 * The body contains four VMULs by the 0.866... constant and four VFNMS with
 * the 0.5 constant, matching the "4 multiplications, 4 fused multiply/add"
 * tally in the header comment.
 *
 * NOTE(review): the leading "Chris@42 <n>" on each line is the "rev" and
 * "line" column of the annotate listing, not part of the C source.
 */
Chris@42 149 static void n1fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 150 {
/* Vector constants: 0.5 and 0.866025403... (numerically sqrt(3)/2). */
Chris@42 151 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 152 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 153 {
Chris@42 154 INT i;
Chris@42 155 const R *xi;
Chris@42 156 R *xo;
Chris@42 157 xi = ri;
Chris@42 158 xo = ro;
/* Loops while i > 0, decreasing i by VL and stepping xi / xo by VL * ivs and
   VL * ovs. MAKE_VOLATILE_STRIDE is evaluated in the increment expression;
   its definition is not part of this listing. */
Chris@42 159 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
/* Values that survive the two load stages below and feed the store stages. */
Chris@42 160 V T5, Ta, TJ, Ty, Tq, Tp, Tg, Tl, TI, TA, Tz, Tu;
/* Stage 1: even-indexed inputs 0, 6, 4, 8, 10, 2 (third LD argument is
   &(xi[0])). Produces T5, Ta, TJ, Ty, Tq, Tp. */
Chris@42 161 {
Chris@42 162 V T1, T6, T4, Tw, T9, Tx;
Chris@42 163 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 164 T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 165 {
Chris@42 166 V T2, T3, T7, T8;
Chris@42 167 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 168 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 169 T4 = VADD(T2, T3);
Chris@42 170 Tw = VSUB(T3, T2);
Chris@42 171 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 172 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 173 T9 = VADD(T7, T8);
Chris@42 174 Tx = VSUB(T8, T7);
Chris@42 175 }
Chris@42 176 T5 = VADD(T1, T4);
Chris@42 177 Ta = VADD(T6, T9);
Chris@42 178 TJ = VADD(Tw, Tx);
Chris@42 179 Ty = VMUL(LDK(KP866025403), VSUB(Tw, Tx));
Chris@42 180 Tq = VFNMS(LDK(KP500000000), T9, T6);
Chris@42 181 Tp = VFNMS(LDK(KP500000000), T4, T1);
Chris@42 182 }
/* Stage 2: odd-indexed inputs 3, 9, 7, 11, 1, 5 (third LD argument is
   &(xi[WS(is, 1)])). Produces Tg, Tl, TI, TA, Tz, Tu with the same
   add / sub / VMUL / VFNMS pattern as stage 1. */
Chris@42 183 {
Chris@42 184 V Tc, Th, Tf, Ts, Tk, Tt;
Chris@42 185 Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 186 Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 187 {
Chris@42 188 V Td, Te, Ti, Tj;
Chris@42 189 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 190 Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 191 Tf = VADD(Td, Te);
Chris@42 192 Ts = VSUB(Te, Td);
Chris@42 193 Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 194 Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 195 Tk = VADD(Ti, Tj);
Chris@42 196 Tt = VSUB(Tj, Ti);
Chris@42 197 }
Chris@42 198 Tg = VADD(Tc, Tf);
Chris@42 199 Tl = VADD(Th, Tk);
Chris@42 200 TI = VADD(Ts, Tt);
Chris@42 201 TA = VFNMS(LDK(KP500000000), Tk, Th);
Chris@42 202 Tz = VFNMS(LDK(KP500000000), Tf, Tc);
Chris@42 203 Tu = VMUL(LDK(KP866025403), VSUB(Ts, Tt));
Chris@42 204 }
/* Outputs 9, 3, 6, 0, built from T5, Ta, Tg, Tl. Tm wraps (Tg - Tl) in VBYI,
   presumably multiplication by the imaginary unit; confirm against the SIMD
   header, which is not part of this listing. */
Chris@42 205 {
Chris@42 206 V Tb, Tm, Tn, To;
Chris@42 207 Tb = VSUB(T5, Ta);
Chris@42 208 Tm = VBYI(VSUB(Tg, Tl));
Chris@42 209 ST(&(xo[WS(os, 9)]), VSUB(Tb, Tm), ovs, &(xo[WS(os, 1)]));
Chris@42 210 ST(&(xo[WS(os, 3)]), VADD(Tb, Tm), ovs, &(xo[WS(os, 1)]));
Chris@42 211 Tn = VADD(T5, Ta);
Chris@42 212 To = VADD(Tg, Tl);
Chris@42 213 ST(&(xo[WS(os, 6)]), VSUB(Tn, To), ovs, &(xo[0]));
Chris@42 214 ST(&(xo[0]), VADD(Tn, To), ovs, &(xo[0]));
Chris@42 215 }
/* Outputs 5, 11, 7, 1, built from Tp, Tq, Tu, Ty, Tz, TA. */
Chris@42 216 {
Chris@42 217 V Tv, TE, TC, TD, Tr, TB;
Chris@42 218 Tr = VSUB(Tp, Tq);
Chris@42 219 Tv = VSUB(Tr, Tu);
Chris@42 220 TE = VADD(Tr, Tu);
Chris@42 221 TB = VSUB(Tz, TA);
Chris@42 222 TC = VBYI(VADD(Ty, TB));
Chris@42 223 TD = VBYI(VSUB(Ty, TB));
Chris@42 224 ST(&(xo[WS(os, 5)]), VSUB(Tv, TC), ovs, &(xo[WS(os, 1)]));
Chris@42 225 ST(&(xo[WS(os, 11)]), VSUB(TE, TD), ovs, &(xo[WS(os, 1)]));
Chris@42 226 ST(&(xo[WS(os, 7)]), VADD(TC, Tv), ovs, &(xo[WS(os, 1)]));
Chris@42 227 ST(&(xo[WS(os, 1)]), VADD(TD, TE), ovs, &(xo[WS(os, 1)]));
Chris@42 228 }
/* Outputs 10, 4, 2, 8, built from TI, TJ, Tp, Tq, Tz, TA; the remaining two
   VMULs by the 0.866... constant are here. */
Chris@42 229 {
Chris@42 230 V TK, TM, TH, TL, TF, TG;
Chris@42 231 TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
Chris@42 232 TM = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
Chris@42 233 TF = VADD(Tp, Tq);
Chris@42 234 TG = VADD(Tz, TA);
Chris@42 235 TH = VSUB(TF, TG);
Chris@42 236 TL = VADD(TF, TG);
Chris@42 237 ST(&(xo[WS(os, 10)]), VSUB(TH, TK), ovs, &(xo[0]));
Chris@42 238 ST(&(xo[WS(os, 4)]), VADD(TL, TM), ovs, &(xo[0]));
Chris@42 239 ST(&(xo[WS(os, 2)]), VADD(TH, TK), ovs, &(xo[0]));
Chris@42 240 ST(&(xo[WS(os, 8)]), VSUB(TL, TM), ovs, &(xo[0]));
Chris@42 241 }
Chris@42 242 }
Chris@42 243 }
/* VLEAVE() is invoked once, after the loop; the macro is defined elsewhere. */
Chris@42 244 VLEAVE();
Chris@42 245 }
Chris@42 246
/* Descriptor handed to kdft_register together with n1fv_12 (non-FMA variant):
   size 12, the name string "n1fv_12", and {44, 4, 4, 0}, whose first three
   entries match the additions / multiplications / fused multiply-add tally in
   this variant's header comment. The meaning of the fourth entry and of the
   trailing zero fields is not visible in this listing. */
Chris@42 247 static const kdft_desc desc = { 12, XSIMD_STRING("n1fv_12"), {44, 4, 4, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 248
/* Registration entry point (non-FMA variant): passes the planner p, the
   n1fv_12 kernel and its descriptor desc to X(kdft_register). XSIMD() and X()
   are macros that are not defined in this listing. */
Chris@42 249 void XSIMD(codelet_n1fv_12) (planner *p) {
Chris@42 250 X(kdft_register) (p, n1fv_12, &desc);
Chris@42 251 }
Chris@42 252
Chris@42 253 #endif /* HAVE_FMA */