annotate src/fftw-3.3.8/dft/simd/common/n2fv_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:05:07 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n2fv_12 -with-ostride 2 -include dft/simd/n2f.h -store-multiple 2 */

/*
 * This function contains 48 FP additions, 20 FP multiplications,
 * (or, 30 additions, 2 multiplications, 18 fused multiply/add),
 * 33 stack variables, 2 constants, and 30 memory accesses
 */
#include "dft/simd/n2f.h"

static void n2fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
               V T5, Ta, TG, TF, TB, Tt, Ti, Tm, TJ, TI, TA, Tp;
               {
                    V T1, T6, T4, Tr, T9, Ts;
                    T1 = LD(&(xi[0]), ivs, &(xi[0]));
                    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                    {
                         V T2, T3, T7, T8;
                         T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         T4 = VADD(T2, T3);
                         Tr = VSUB(T3, T2);
                         T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                         T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         T9 = VADD(T7, T8);
                         Ts = VSUB(T8, T7);
                    }
                    T5 = VFNMS(LDK(KP500000000), T4, T1);
                    Ta = VFNMS(LDK(KP500000000), T9, T6);
                    TG = VADD(T6, T9);
                    TF = VADD(T1, T4);
                    TB = VADD(Tr, Ts);
                    Tt = VSUB(Tr, Ts);
               }
               {
                    V Tk, Tn, Te, Tl, Th, To;
                    Tk = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                    Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                    {
                         V Tc, Td, Tf, Tg;
                         Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                         Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         Te = VSUB(Tc, Td);
                         Tl = VADD(Td, Tc);
                         Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                         Th = VSUB(Tf, Tg);
                         To = VADD(Tf, Tg);
                    }
                    Ti = VADD(Te, Th);
                    Tm = VFNMS(LDK(KP500000000), Tl, Tk);
                    TJ = VADD(Tn, To);
                    TI = VADD(Tk, Tl);
                    TA = VSUB(Te, Th);
                    Tp = VFNMS(LDK(KP500000000), To, Tn);
               }
               {
                    V TN, TO, TP, TQ, TT, TU;
                    {
                         V TH, TK, TL, TM;
                         TH = VSUB(TF, TG);
                         TK = VSUB(TI, TJ);
                         TN = VFNMSI(TK, TH);
                         STM2(&(xo[18]), TN, ovs, &(xo[2]));
                         TO = VFMAI(TK, TH);
                         STM2(&(xo[6]), TO, ovs, &(xo[2]));
                         TL = VADD(TF, TG);
                         TM = VADD(TI, TJ);
                         TP = VSUB(TL, TM);
                         STM2(&(xo[12]), TP, ovs, &(xo[0]));
                         TQ = VADD(TL, TM);
                         STM2(&(xo[0]), TQ, ovs, &(xo[0]));
                    }
                    {
                         V Tj, Tv, Tu, Tw, Tb, Tq, TR, TS;
                         Tb = VSUB(T5, Ta);
                         Tj = VFMA(LDK(KP866025403), Ti, Tb);
                         Tv = VFNMS(LDK(KP866025403), Ti, Tb);
                         Tq = VSUB(Tm, Tp);
                         Tu = VFNMS(LDK(KP866025403), Tt, Tq);
                         Tw = VFMA(LDK(KP866025403), Tt, Tq);
                         TR = VFNMSI(Tu, Tj);
                         STM2(&(xo[2]), TR, ovs, &(xo[2]));
                         STN2(&(xo[0]), TQ, TR, ovs);
                         TS = VFMAI(Tw, Tv);
                         STM2(&(xo[14]), TS, ovs, &(xo[2]));
                         STN2(&(xo[12]), TP, TS, ovs);
                         TT = VFMAI(Tu, Tj);
                         STM2(&(xo[22]), TT, ovs, &(xo[2]));
                         TU = VFNMSI(Tw, Tv);
                         STM2(&(xo[10]), TU, ovs, &(xo[2]));
                    }
                    {
                         V TC, TE, Tz, TD, Tx, Ty;
                         TC = VMUL(LDK(KP866025403), VSUB(TA, TB));
                         TE = VMUL(LDK(KP866025403), VADD(TB, TA));
                         Tx = VADD(T5, Ta);
                         Ty = VADD(Tm, Tp);
                         Tz = VSUB(Tx, Ty);
                         TD = VADD(Tx, Ty);
                         {
                              V TV, TW, TX, TY;
                              TV = VFMAI(TC, Tz);
                              STM2(&(xo[4]), TV, ovs, &(xo[0]));
                              STN2(&(xo[4]), TV, TO, ovs);
                              TW = VFNMSI(TE, TD);
                              STM2(&(xo[16]), TW, ovs, &(xo[0]));
                              STN2(&(xo[16]), TW, TN, ovs);
                              TX = VFNMSI(TC, Tz);
                              STM2(&(xo[20]), TX, ovs, &(xo[0]));
                              STN2(&(xo[20]), TX, TT, ovs);
                              TY = VFMAI(TE, TD);
                              STM2(&(xo[8]), TY, ovs, &(xo[0]));
                              STN2(&(xo[8]), TY, TU, ovs);
                         }
                    }
               }
          }
     }
     VLEAVE();
}

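/*
 * The descriptor below records the codelet size (12), its name, its
 * operation counts {adds, muls, fmas, other}, and its genus; the
 * registration function hands it to the planner, which can use those
 * counts when weighing candidate plans for a 12-point DFT.
 */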
static const kdft_desc desc = { 12, XSIMD_STRING("n2fv_12"), {30, 2, 18, 0}, &GENUS, 0, 2, 0, 0 };

void XSIMD(codelet_n2fv_12) (planner *p) {
     X(kdft_register) (p, n2fv_12, &desc);
}

#else

/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n2fv_12 -with-ostride 2 -include dft/simd/n2f.h -store-multiple 2 */

/*
 * This function contains 48 FP additions, 8 FP multiplications,
 * (or, 44 additions, 4 multiplications, 4 fused multiply/add),
 * 33 stack variables, 2 constants, and 30 memory accesses
 */
#include "dft/simd/n2f.h"

static void n2fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
               V T5, Ta, TJ, Ty, Tq, Tp, Tg, Tl, TI, TA, Tz, Tu;
               {
                    V T1, T6, T4, Tw, T9, Tx;
                    T1 = LD(&(xi[0]), ivs, &(xi[0]));
                    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                    {
                         V T2, T3, T7, T8;
                         T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         T4 = VADD(T2, T3);
                         Tw = VSUB(T3, T2);
                         T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                         T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         T9 = VADD(T7, T8);
                         Tx = VSUB(T8, T7);
                    }
                    T5 = VADD(T1, T4);
                    Ta = VADD(T6, T9);
                    TJ = VADD(Tw, Tx);
                    Ty = VMUL(LDK(KP866025403), VSUB(Tw, Tx));
                    Tq = VFNMS(LDK(KP500000000), T9, T6);
                    Tp = VFNMS(LDK(KP500000000), T4, T1);
               }
               {
                    V Tc, Th, Tf, Ts, Tk, Tt;
                    Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                    Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                    {
                         V Td, Te, Ti, Tj;
                         Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                         Tf = VADD(Td, Te);
                         Ts = VSUB(Te, Td);
                         Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                         Tk = VADD(Ti, Tj);
                         Tt = VSUB(Tj, Ti);
                    }
                    Tg = VADD(Tc, Tf);
                    Tl = VADD(Th, Tk);
                    TI = VADD(Ts, Tt);
                    TA = VFNMS(LDK(KP500000000), Tk, Th);
                    Tz = VFNMS(LDK(KP500000000), Tf, Tc);
                    Tu = VMUL(LDK(KP866025403), VSUB(Ts, Tt));
               }
               {
                    V TN, TO, TP, TQ, TR, TS;
                    {
                         V Tb, Tm, Tn, To;
                         Tb = VSUB(T5, Ta);
                         Tm = VBYI(VSUB(Tg, Tl));
                         TN = VSUB(Tb, Tm);
                         STM2(&(xo[18]), TN, ovs, &(xo[2]));
                         TO = VADD(Tb, Tm);
                         STM2(&(xo[6]), TO, ovs, &(xo[2]));
                         Tn = VADD(T5, Ta);
                         To = VADD(Tg, Tl);
                         TP = VSUB(Tn, To);
                         STM2(&(xo[12]), TP, ovs, &(xo[0]));
                         TQ = VADD(Tn, To);
                         STM2(&(xo[0]), TQ, ovs, &(xo[0]));
                    }
                    {
                         V Tv, TE, TC, TD, Tr, TB, TT, TU;
                         Tr = VSUB(Tp, Tq);
                         Tv = VSUB(Tr, Tu);
                         TE = VADD(Tr, Tu);
                         TB = VSUB(Tz, TA);
                         TC = VBYI(VADD(Ty, TB));
                         TD = VBYI(VSUB(Ty, TB));
                         TR = VSUB(Tv, TC);
                         STM2(&(xo[10]), TR, ovs, &(xo[2]));
                         TS = VSUB(TE, TD);
                         STM2(&(xo[22]), TS, ovs, &(xo[2]));
                         TT = VADD(TC, Tv);
                         STM2(&(xo[14]), TT, ovs, &(xo[2]));
                         STN2(&(xo[12]), TP, TT, ovs);
                         TU = VADD(TD, TE);
                         STM2(&(xo[2]), TU, ovs, &(xo[2]));
                         STN2(&(xo[0]), TQ, TU, ovs);
                    }
                    {
                         V TK, TM, TH, TL, TF, TG;
                         TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
                         TM = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
                         TF = VADD(Tp, Tq);
                         TG = VADD(Tz, TA);
                         TH = VSUB(TF, TG);
                         TL = VADD(TF, TG);
                         {
                              V TV, TW, TX, TY;
                              TV = VSUB(TH, TK);
                              STM2(&(xo[20]), TV, ovs, &(xo[0]));
                              STN2(&(xo[20]), TV, TS, ovs);
                              TW = VADD(TL, TM);
                              STM2(&(xo[8]), TW, ovs, &(xo[0]));
                              STN2(&(xo[8]), TW, TR, ovs);
                              TX = VADD(TH, TK);
                              STM2(&(xo[4]), TX, ovs, &(xo[0]));
                              STN2(&(xo[4]), TX, TO, ovs);
                              TY = VSUB(TL, TM);
                              STM2(&(xo[16]), TY, ovs, &(xo[0]));
                              STN2(&(xo[16]), TY, TN, ovs);
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 12, XSIMD_STRING("n2fv_12"), {44, 4, 4, 0}, &GENUS, 0, 2, 0, 0 };

void XSIMD(codelet_n2fv_12) (planner *p) {
     X(kdft_register) (p, n2fv_12, &desc);
}

#endif
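/*
 * Usage sketch (not part of the generated codelet): n2fv_12 is an internal
 * building block, and a size-12 forward complex DFT planned through the
 * public fftw3 API may be executed by it when the planner selects this
 * codelet on a SIMD-capable build.  A minimal self-contained caller could
 * look like the following; all names and flags are the standard fftw3 API:
 *
 *   #include <fftw3.h>
 *
 *   int main(void)
 *   {
 *        int i;
 *        fftw_complex *in = fftw_malloc(sizeof(fftw_complex) * 12);
 *        fftw_complex *out = fftw_malloc(sizeof(fftw_complex) * 12);
 *        // Plan first: FFTW_MEASURE may overwrite the arrays while timing.
 *        fftw_plan p = fftw_plan_dft_1d(12, in, out, FFTW_FORWARD, FFTW_MEASURE);
 *        for (i = 0; i < 12; ++i) {
 *             in[i][0] = (double) i;  // real part
 *             in[i][1] = 0.0;         // imaginary part
 *        }
 *        fftw_execute(p);            // out now holds the 12-point forward DFT
 *        fftw_destroy_plan(p);
 *        fftw_free(in);
 *        fftw_free(out);
 *        return 0;
 *   }
 */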