annotate src/fftw-3.3.8/dft/simd/common/n1fv_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:51 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n1fv_12 -include dft/simd/n1f.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 48 FP additions, 20 FP multiplications,
Chris@82 32 * (or, 30 additions, 2 multiplications, 18 fused multiply/add),
Chris@82 33 * 27 stack variables, 2 constants, and 24 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n1f.h"
Chris@82 36
/*
 * n1fv_12, FMA build (selected when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).  Per the generator comment above,
 * this body was produced by genfft with "-n 12".
 *
 * Each loop pass loads the twelve inputs xi[WS(is, 0..11)] and stores the
 * twelve outputs xo[WS(os, 0..11)].
 *
 * ri, ro    base pointers copied into xi / xo.
 * ii, io    not referenced directly in this body.
 *           NOTE(review): presumably the LD/ST macros from dft/simd/n1f.h
 *           cover the imaginary parts -- confirm against that header.
 * is, os    passed to WS() to form the per-element input/output offsets.
 * v         loop count; decremented by VL on every pass.
 * ivs, ovs  xi advances by VL * ivs and xo by VL * ovs on every pass.
 */
Chris@82 37 static void n1fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
/* The two constants used below: 0.866025403... (= sqrt(3)/2) and 0.5. */
Chris@82 39 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 41 {
Chris@82 42 INT i;
Chris@82 43 const R *xi;
Chris@82 44 R *xo;
Chris@82 45 xi = ri;
Chris@82 46 xo = ro;
/* Consume v in steps of VL, moving both pointers forward on each pass. */
Chris@82 47 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
/* Intermediates that are shared between the load stages and the store stages. */
Chris@82 48 V T5, Ta, TG, TF, TB, Tt, Ti, Tm, TJ, TI, TA, Tp;
/* Stage 1: even-indexed inputs 0, 6, 4, 8, 10, 2. */
Chris@82 49 {
Chris@82 50 V T1, T6, T4, Tr, T9, Ts;
Chris@82 51 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 52 T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 53 {
Chris@82 54 V T2, T3, T7, T8;
Chris@82 55 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 56 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 57 T4 = VADD(T2, T3);
Chris@82 58 Tr = VSUB(T3, T2);
Chris@82 59 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 60 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 61 T9 = VADD(T7, T8);
Chris@82 62 Ts = VSUB(T8, T7);
Chris@82 63 }
Chris@82 64 T5 = VFNMS(LDK(KP500000000), T4, T1);
Chris@82 65 Ta = VFNMS(LDK(KP500000000), T9, T6);
Chris@82 66 TG = VADD(T6, T9);
Chris@82 67 TF = VADD(T1, T4);
Chris@82 68 TB = VADD(Tr, Ts);
Chris@82 69 Tt = VSUB(Tr, Ts);
Chris@82 70 }
/* Stage 2: odd-indexed inputs 3, 9, 11, 7, 1, 5. */
Chris@82 71 {
Chris@82 72 V Tk, Tn, Te, Tl, Th, To;
Chris@82 73 Tk = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 74 Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 75 {
Chris@82 76 V Tc, Td, Tf, Tg;
Chris@82 77 Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 78 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 79 Te = VSUB(Tc, Td);
Chris@82 80 Tl = VADD(Td, Tc);
Chris@82 81 Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 82 Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 83 Th = VSUB(Tf, Tg);
Chris@82 84 To = VADD(Tf, Tg);
Chris@82 85 }
Chris@82 86 Ti = VADD(Te, Th);
Chris@82 87 Tm = VFNMS(LDK(KP500000000), Tl, Tk);
Chris@82 88 TJ = VADD(Tn, To);
Chris@82 89 TI = VADD(Tk, Tl);
Chris@82 90 TA = VSUB(Te, Th);
Chris@82 91 Tp = VFNMS(LDK(KP500000000), To, Tn);
Chris@82 92 }
/* Stage 3: outputs 9, 3, 6 and 0, built from the plain sums TF, TG, TI, TJ. */
Chris@82 93 {
Chris@82 94 V TH, TK, TL, TM;
Chris@82 95 TH = VSUB(TF, TG);
Chris@82 96 TK = VSUB(TI, TJ);
Chris@82 97 ST(&(xo[WS(os, 9)]), VFNMSI(TK, TH), ovs, &(xo[WS(os, 1)]));
Chris@82 98 ST(&(xo[WS(os, 3)]), VFMAI(TK, TH), ovs, &(xo[WS(os, 1)]));
Chris@82 99 TL = VADD(TF, TG);
Chris@82 100 TM = VADD(TI, TJ);
Chris@82 101 ST(&(xo[WS(os, 6)]), VSUB(TL, TM), ovs, &(xo[0]));
Chris@82 102 ST(&(xo[0]), VADD(TL, TM), ovs, &(xo[0]));
Chris@82 103 }
/* Stage 4: outputs 1, 7, 11 and 5, built from the differences T5 - Ta and Tm - Tp. */
Chris@82 104 {
Chris@82 105 V Tj, Tv, Tu, Tw, Tb, Tq;
Chris@82 106 Tb = VSUB(T5, Ta);
Chris@82 107 Tj = VFMA(LDK(KP866025403), Ti, Tb);
Chris@82 108 Tv = VFNMS(LDK(KP866025403), Ti, Tb);
Chris@82 109 Tq = VSUB(Tm, Tp);
Chris@82 110 Tu = VFNMS(LDK(KP866025403), Tt, Tq);
Chris@82 111 Tw = VFMA(LDK(KP866025403), Tt, Tq);
Chris@82 112 ST(&(xo[WS(os, 1)]), VFNMSI(Tu, Tj), ovs, &(xo[WS(os, 1)]));
Chris@82 113 ST(&(xo[WS(os, 7)]), VFMAI(Tw, Tv), ovs, &(xo[WS(os, 1)]));
Chris@82 114 ST(&(xo[WS(os, 11)]), VFMAI(Tu, Tj), ovs, &(xo[WS(os, 1)]));
Chris@82 115 ST(&(xo[WS(os, 5)]), VFNMSI(Tw, Tv), ovs, &(xo[WS(os, 1)]));
Chris@82 116 }
/* Stage 5: outputs 2, 8, 10 and 4, built from the sums T5 + Ta and Tm + Tp. */
Chris@82 117 {
Chris@82 118 V TC, TE, Tz, TD, Tx, Ty;
Chris@82 119 TC = VMUL(LDK(KP866025403), VSUB(TA, TB));
Chris@82 120 TE = VMUL(LDK(KP866025403), VADD(TB, TA));
Chris@82 121 Tx = VADD(T5, Ta);
Chris@82 122 Ty = VADD(Tm, Tp);
Chris@82 123 Tz = VSUB(Tx, Ty);
Chris@82 124 TD = VADD(Tx, Ty);
Chris@82 125 ST(&(xo[WS(os, 2)]), VFMAI(TC, Tz), ovs, &(xo[0]));
Chris@82 126 ST(&(xo[WS(os, 8)]), VFNMSI(TE, TD), ovs, &(xo[0]));
Chris@82 127 ST(&(xo[WS(os, 10)]), VFNMSI(TC, Tz), ovs, &(xo[0]));
Chris@82 128 ST(&(xo[WS(os, 4)]), VFMAI(TE, TD), ovs, &(xo[0]));
Chris@82 129 }
Chris@82 130 }
Chris@82 131 }
Chris@82 132 VLEAVE();
Chris@82 133 }
Chris@82 134
/*
 * Descriptor handed to the planner for the FMA build of n1fv_12.
 * The values 30, 2, 18 equal the addition / multiplication / fused
 * multiply-add counts quoted in the generator comment for this variant.
 * NOTE(review): the meaning of the remaining fields (12, &GENUS and the
 * trailing zeros) is defined by kdft_desc, which is declared outside this
 * file -- confirm there.
 */
Chris@82 135 static const kdft_desc desc = { 12, XSIMD_STRING("n1fv_12"), {30, 2, 18, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 136
/*
 * Registration entry point (FMA build): passes the n1fv_12 kernel and its
 * descriptor to kdft_register for the given planner p.
 */
Chris@82 137 void XSIMD(codelet_n1fv_12) (planner *p) {
Chris@82 138 X(kdft_register) (p, n1fv_12, &desc);
Chris@82 139 }
Chris@82 140
Chris@82 141 #else
Chris@82 142
Chris@82 143 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n1fv_12 -include dft/simd/n1f.h */
Chris@82 144
Chris@82 145 /*
Chris@82 146 * This function contains 48 FP additions, 8 FP multiplications,
Chris@82 147 * (or, 44 additions, 4 multiplications, 4 fused multiply/add),
Chris@82 148 * 27 stack variables, 2 constants, and 24 memory accesses
Chris@82 149 */
Chris@82 150 #include "dft/simd/n1f.h"
Chris@82 151
/*
 * n1fv_12, non-FMA build (the #else branch, used when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).  Per the
 * generator comment above, this body was produced by genfft with "-n 12"
 * and without the -fma option.
 *
 * Each loop pass loads the twelve inputs xi[WS(is, 0..11)] and stores the
 * twelve outputs xo[WS(os, 0..11)].
 *
 * ri, ro    base pointers copied into xi / xo.
 * ii, io    not referenced directly in this body.
 *           NOTE(review): presumably the LD/ST macros from dft/simd/n1f.h
 *           cover the imaginary parts -- confirm against that header.
 * is, os    passed to WS() to form the per-element input/output offsets.
 * v         loop count; decremented by VL on every pass.
 * ivs, ovs  xi advances by VL * ivs and xo by VL * ovs on every pass.
 */
Chris@82 152 static void n1fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 153 {
/* The two constants used below: 0.5 and 0.866025403... (= sqrt(3)/2). */
Chris@82 154 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 155 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 156 {
Chris@82 157 INT i;
Chris@82 158 const R *xi;
Chris@82 159 R *xo;
Chris@82 160 xi = ri;
Chris@82 161 xo = ro;
/* Consume v in steps of VL, moving both pointers forward on each pass. */
Chris@82 162 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
/* Intermediates that are shared between the load stages and the store stages. */
Chris@82 163 V T5, Ta, TJ, Ty, Tq, Tp, Tg, Tl, TI, TA, Tz, Tu;
/* Stage 1: even-indexed inputs 0, 6, 4, 8, 10, 2. */
Chris@82 164 {
Chris@82 165 V T1, T6, T4, Tw, T9, Tx;
Chris@82 166 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 167 T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 168 {
Chris@82 169 V T2, T3, T7, T8;
Chris@82 170 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 171 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 172 T4 = VADD(T2, T3);
Chris@82 173 Tw = VSUB(T3, T2);
Chris@82 174 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 175 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 176 T9 = VADD(T7, T8);
Chris@82 177 Tx = VSUB(T8, T7);
Chris@82 178 }
Chris@82 179 T5 = VADD(T1, T4);
Chris@82 180 Ta = VADD(T6, T9);
Chris@82 181 TJ = VADD(Tw, Tx);
Chris@82 182 Ty = VMUL(LDK(KP866025403), VSUB(Tw, Tx));
Chris@82 183 Tq = VFNMS(LDK(KP500000000), T9, T6);
Chris@82 184 Tp = VFNMS(LDK(KP500000000), T4, T1);
Chris@82 185 }
/* Stage 2: odd-indexed inputs 3, 9, 7, 11, 1, 5. */
Chris@82 186 {
Chris@82 187 V Tc, Th, Tf, Ts, Tk, Tt;
Chris@82 188 Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 189 Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 190 {
Chris@82 191 V Td, Te, Ti, Tj;
Chris@82 192 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 193 Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 194 Tf = VADD(Td, Te);
Chris@82 195 Ts = VSUB(Te, Td);
Chris@82 196 Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 197 Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 198 Tk = VADD(Ti, Tj);
Chris@82 199 Tt = VSUB(Tj, Ti);
Chris@82 200 }
Chris@82 201 Tg = VADD(Tc, Tf);
Chris@82 202 Tl = VADD(Th, Tk);
Chris@82 203 TI = VADD(Ts, Tt);
Chris@82 204 TA = VFNMS(LDK(KP500000000), Tk, Th);
Chris@82 205 Tz = VFNMS(LDK(KP500000000), Tf, Tc);
Chris@82 206 Tu = VMUL(LDK(KP866025403), VSUB(Ts, Tt));
Chris@82 207 }
/* Stage 3: outputs 9, 3, 6 and 0, built from the plain sums T5, Ta, Tg, Tl. */
Chris@82 208 {
Chris@82 209 V Tb, Tm, Tn, To;
Chris@82 210 Tb = VSUB(T5, Ta);
Chris@82 211 Tm = VBYI(VSUB(Tg, Tl));
Chris@82 212 ST(&(xo[WS(os, 9)]), VSUB(Tb, Tm), ovs, &(xo[WS(os, 1)]));
Chris@82 213 ST(&(xo[WS(os, 3)]), VADD(Tb, Tm), ovs, &(xo[WS(os, 1)]));
Chris@82 214 Tn = VADD(T5, Ta);
Chris@82 215 To = VADD(Tg, Tl);
Chris@82 216 ST(&(xo[WS(os, 6)]), VSUB(Tn, To), ovs, &(xo[0]));
Chris@82 217 ST(&(xo[0]), VADD(Tn, To), ovs, &(xo[0]));
Chris@82 218 }
/* Stage 4: outputs 5, 11, 7 and 1, built from the differences Tp - Tq and Tz - TA. */
Chris@82 219 {
Chris@82 220 V Tv, TE, TC, TD, Tr, TB;
Chris@82 221 Tr = VSUB(Tp, Tq);
Chris@82 222 Tv = VSUB(Tr, Tu);
Chris@82 223 TE = VADD(Tr, Tu);
Chris@82 224 TB = VSUB(Tz, TA);
Chris@82 225 TC = VBYI(VADD(Ty, TB));
Chris@82 226 TD = VBYI(VSUB(Ty, TB));
Chris@82 227 ST(&(xo[WS(os, 5)]), VSUB(Tv, TC), ovs, &(xo[WS(os, 1)]));
Chris@82 228 ST(&(xo[WS(os, 11)]), VSUB(TE, TD), ovs, &(xo[WS(os, 1)]));
Chris@82 229 ST(&(xo[WS(os, 7)]), VADD(TC, Tv), ovs, &(xo[WS(os, 1)]));
Chris@82 230 ST(&(xo[WS(os, 1)]), VADD(TD, TE), ovs, &(xo[WS(os, 1)]));
Chris@82 231 }
/* Stage 5: outputs 10, 4, 2 and 8, built from the sums Tp + Tq and Tz + TA. */
Chris@82 232 {
Chris@82 233 V TK, TM, TH, TL, TF, TG;
Chris@82 234 TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
Chris@82 235 TM = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
Chris@82 236 TF = VADD(Tp, Tq);
Chris@82 237 TG = VADD(Tz, TA);
Chris@82 238 TH = VSUB(TF, TG);
Chris@82 239 TL = VADD(TF, TG);
Chris@82 240 ST(&(xo[WS(os, 10)]), VSUB(TH, TK), ovs, &(xo[0]));
Chris@82 241 ST(&(xo[WS(os, 4)]), VADD(TL, TM), ovs, &(xo[0]));
Chris@82 242 ST(&(xo[WS(os, 2)]), VADD(TH, TK), ovs, &(xo[0]));
Chris@82 243 ST(&(xo[WS(os, 8)]), VSUB(TL, TM), ovs, &(xo[0]));
Chris@82 244 }
Chris@82 245 }
Chris@82 246 }
Chris@82 247 VLEAVE();
Chris@82 248 }
Chris@82 249
/*
 * Descriptor handed to the planner for the non-FMA build of n1fv_12.
 * The values 44, 4, 4 equal the addition / multiplication / fused
 * multiply-add counts quoted in the generator comment for this variant.
 * NOTE(review): the meaning of the remaining fields (12, &GENUS and the
 * trailing zeros) is defined by kdft_desc, which is declared outside this
 * file -- confirm there.
 */
Chris@82 250 static const kdft_desc desc = { 12, XSIMD_STRING("n1fv_12"), {44, 4, 4, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 251
/*
 * Registration entry point (non-FMA build): passes the n1fv_12 kernel and
 * its descriptor to kdft_register for the given planner p.
 */
Chris@82 252 void XSIMD(codelet_n1fv_12) (planner *p) {
Chris@82 253 X(kdft_register) (p, n1fv_12, &desc);
Chris@82 254 }
Chris@82 255
Chris@82 256 #endif