annotate src/fftw-3.3.5/dft/simd/common/n2fv_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Jul 30 16:40:09 EDT 2016 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n2fv_12 -with-ostride 2 -include n2f.h -store-multiple 2 */
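
/*
 * Note (not part of the generated output): the generator options above are
 * believed to mean roughly the following.  "-fma" schedules the codelet
 * around fused multiply-add operations, "-simd" emits vector code using the
 * V type, "-n 12" sets the transform size, "-name n2fv_12" names the
 * codelet, "-with-ostride 2" hard-codes the output stride for interleaved
 * complex output, and "-store-multiple 2" groups output stores into the
 * STM2/STN2 pairs seen below.
 */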

/*
 * This function contains 48 FP additions, 20 FP multiplications,
 * (or, 30 additions, 2 multiplications, 18 fused multiply/add),
 * 61 stack variables, 2 constants, and 30 memory accesses
 */
#include "n2f.h"

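/*
 * Note (not part of the generated output): the primitives used in both
 * variants of this file come from FFTW's SIMD support headers.  Roughly:
 * V is a vector of R (real) values packing VL complex elements, DVK/LDK
 * define and load vector constants, LD performs a vector load, STM2/STN2
 * store interleaved complex results with an output stride of 2,
 * VADD/VSUB/VMUL are elementwise, VFMA(a, b, c) is understood to compute
 * a * b + c and VFNMS(a, b, c) to compute c - a * b, VFMAI/VFNMSI add or
 * subtract i times their first operand, VBYI multiplies by i, and
 * MAKE_VOLATILE_STRIDE/VLEAVE are portability and optimization hooks.
 */
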
static void n2fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
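     /* Note (not generated): these constants are sqrt(3)/2 = sin(pi/3) and
        1/2, as expected from the radix-3 factor of the size-12 transform. */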
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
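          /* Note (not generated): each pass of the loop below is understood
             to compute VL of the v independent length-12 transforms, stepping
             the input and output pointers by VL * ivs and VL * ovs. */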
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
               V T1, T6, Tk, Tn, Tc, Td, Tf, Tr, T4, Ts, T9, Tg, Te, Tl;
               {
                    V T2, T3, T7, T8;
                    T1 = LD(&(xi[0]), ivs, &(xi[0]));
                    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                    T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                    T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                    T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                    Tk = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                    Tn = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                    Tc = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                    Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                    Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                    Tr = VSUB(T3, T2);
                    T4 = VADD(T2, T3);
                    Ts = VSUB(T8, T7);
                    T9 = VADD(T7, T8);
                    Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
               }
               Te = VSUB(Tc, Td);
               Tl = VADD(Td, Tc);
               {
                    V T5, TF, TB, Tt, Ta, TG, Th, To, Tm, TI;
                    T5 = VFNMS(LDK(KP500000000), T4, T1);
                    TF = VADD(T1, T4);
                    TB = VADD(Tr, Ts);
                    Tt = VSUB(Tr, Ts);
                    Ta = VFNMS(LDK(KP500000000), T9, T6);
                    TG = VADD(T6, T9);
                    Th = VSUB(Tf, Tg);
                    To = VADD(Tf, Tg);
                    Tm = VFNMS(LDK(KP500000000), Tl, Tk);
                    TI = VADD(Tk, Tl);
                    {
                         V TH, TL, Tb, Tx, TJ, Tp, Ti, TA;
                         TH = VSUB(TF, TG);
                         TL = VADD(TF, TG);
                         Tb = VSUB(T5, Ta);
                         Tx = VADD(T5, Ta);
                         TJ = VADD(Tn, To);
                         Tp = VFNMS(LDK(KP500000000), To, Tn);
                         Ti = VADD(Te, Th);
                         TA = VSUB(Te, Th);
                         {
                              V Tq, Ty, TK, TM;
                              Tq = VSUB(Tm, Tp);
                              Ty = VADD(Tm, Tp);
                              TK = VSUB(TI, TJ);
                              TM = VADD(TI, TJ);
                              {
                                   V TC, TE, Tj, Tv;
                                   TC = VMUL(LDK(KP866025403), VSUB(TA, TB));
                                   TE = VMUL(LDK(KP866025403), VADD(TB, TA));
                                   Tj = VFMA(LDK(KP866025403), Ti, Tb);
                                   Tv = VFNMS(LDK(KP866025403), Ti, Tb);
                                   {
                                        V Tz, TD, Tu, Tw;
                                        Tz = VSUB(Tx, Ty);
                                        TD = VADD(Tx, Ty);
                                        Tu = VFNMS(LDK(KP866025403), Tt, Tq);
                                        Tw = VFMA(LDK(KP866025403), Tt, Tq);
                                        {
                                             V TN, TO, TP, TQ;
                                             TN = VADD(TL, TM);
                                             STM2(&(xo[0]), TN, ovs, &(xo[0]));
                                             TO = VSUB(TL, TM);
                                             STM2(&(xo[12]), TO, ovs, &(xo[0]));
                                             TP = VFMAI(TK, TH);
                                             STM2(&(xo[6]), TP, ovs, &(xo[2]));
                                             TQ = VFNMSI(TK, TH);
                                             STM2(&(xo[18]), TQ, ovs, &(xo[2]));
                                             {
                                                  V TR, TS, TT, TU;
                                                  TR = VFMAI(TE, TD);
                                                  STM2(&(xo[8]), TR, ovs, &(xo[0]));
                                                  TS = VFNMSI(TE, TD);
                                                  STM2(&(xo[16]), TS, ovs, &(xo[0]));
                                                  STN2(&(xo[16]), TS, TQ, ovs);
                                                  TT = VFNMSI(TC, Tz);
                                                  STM2(&(xo[20]), TT, ovs, &(xo[0]));
                                                  TU = VFMAI(TC, Tz);
                                                  STM2(&(xo[4]), TU, ovs, &(xo[0]));
                                                  STN2(&(xo[4]), TU, TP, ovs);
                                                  {
                                                       V TV, TW, TX, TY;
                                                       TV = VFNMSI(Tw, Tv);
                                                       STM2(&(xo[10]), TV, ovs, &(xo[2]));
                                                       STN2(&(xo[8]), TR, TV, ovs);
                                                       TW = VFMAI(Tw, Tv);
                                                       STM2(&(xo[14]), TW, ovs, &(xo[2]));
                                                       STN2(&(xo[12]), TO, TW, ovs);
                                                       TX = VFMAI(Tu, Tj);
                                                       STM2(&(xo[22]), TX, ovs, &(xo[2]));
                                                       STN2(&(xo[20]), TT, TX, ovs);
                                                       TY = VFNMSI(Tu, Tj);
                                                       STM2(&(xo[2]), TY, ovs, &(xo[2]));
                                                       STN2(&(xo[0]), TN, TY, ovs);
                                                  }
                                             }
                                        }
                                   }
                              }
                         }
                    }
               }
          }
     }
     VLEAVE();
}

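/* Note (not generated): the descriptor below appears to hold the transform
   size (12), the codelet name, the operation counts {adds, muls, fmas, other}
   quoted in the comment at the top of this variant, plus genus and planner
   constraint fields. */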
static const kdft_desc desc = { 12, XSIMD_STRING("n2fv_12"), {30, 2, 18, 0}, &GENUS, 0, 2, 0, 0 };

void XSIMD(codelet_n2fv_12) (planner *p) {
     X(kdft_register) (p, n2fv_12, &desc);
}

#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name n2fv_12 -with-ostride 2 -include n2f.h -store-multiple 2 */
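
/*
 * Note (not part of the generated output): this fallback variant is produced
 * without the -fma and instruction-scheduling options, so it expresses the
 * same transform with plain adds, multiplies, and VBYI (multiply by i)
 * operations rather than fused multiply-adds; the operation-count comment
 * below reflects the different balance.
 */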

/*
 * This function contains 48 FP additions, 8 FP multiplications,
 * (or, 44 additions, 4 multiplications, 4 fused multiply/add),
 * 33 stack variables, 2 constants, and 30 memory accesses
 */
#include "n2f.h"

static void n2fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ri;
          xo = ro;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
               V T5, Ta, TJ, Ty, Tq, Tp, Tg, Tl, TI, TA, Tz, Tu;
               {
                    V T1, T6, T4, Tw, T9, Tx;
                    T1 = LD(&(xi[0]), ivs, &(xi[0]));
                    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                    {
                         V T2, T3, T7, T8;
                         T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         T4 = VADD(T2, T3);
                         Tw = VSUB(T3, T2);
                         T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                         T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         T9 = VADD(T7, T8);
                         Tx = VSUB(T8, T7);
                    }
                    T5 = VADD(T1, T4);
                    Ta = VADD(T6, T9);
                    TJ = VADD(Tw, Tx);
                    Ty = VMUL(LDK(KP866025403), VSUB(Tw, Tx));
                    Tq = VFNMS(LDK(KP500000000), T9, T6);
                    Tp = VFNMS(LDK(KP500000000), T4, T1);
               }
               {
                    V Tc, Th, Tf, Ts, Tk, Tt;
                    Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                    Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                    {
                         V Td, Te, Ti, Tj;
                         Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                         Tf = VADD(Td, Te);
                         Ts = VSUB(Te, Td);
                         Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                         Tk = VADD(Ti, Tj);
                         Tt = VSUB(Tj, Ti);
                    }
                    Tg = VADD(Tc, Tf);
                    Tl = VADD(Th, Tk);
                    TI = VADD(Ts, Tt);
                    TA = VFNMS(LDK(KP500000000), Tk, Th);
                    Tz = VFNMS(LDK(KP500000000), Tf, Tc);
                    Tu = VMUL(LDK(KP866025403), VSUB(Ts, Tt));
               }
               {
                    V TN, TO, TP, TQ, TR, TS;
                    {
                         V Tb, Tm, Tn, To;
                         Tb = VSUB(T5, Ta);
                         Tm = VBYI(VSUB(Tg, Tl));
                         TN = VSUB(Tb, Tm);
                         STM2(&(xo[18]), TN, ovs, &(xo[2]));
                         TO = VADD(Tb, Tm);
                         STM2(&(xo[6]), TO, ovs, &(xo[2]));
                         Tn = VADD(T5, Ta);
                         To = VADD(Tg, Tl);
                         TP = VSUB(Tn, To);
                         STM2(&(xo[12]), TP, ovs, &(xo[0]));
                         TQ = VADD(Tn, To);
                         STM2(&(xo[0]), TQ, ovs, &(xo[0]));
                    }
                    {
                         V Tv, TE, TC, TD, Tr, TB, TT, TU;
                         Tr = VSUB(Tp, Tq);
                         Tv = VSUB(Tr, Tu);
                         TE = VADD(Tr, Tu);
                         TB = VSUB(Tz, TA);
                         TC = VBYI(VADD(Ty, TB));
                         TD = VBYI(VSUB(Ty, TB));
                         TR = VSUB(Tv, TC);
                         STM2(&(xo[10]), TR, ovs, &(xo[2]));
                         TS = VSUB(TE, TD);
                         STM2(&(xo[22]), TS, ovs, &(xo[2]));
                         TT = VADD(TC, Tv);
                         STM2(&(xo[14]), TT, ovs, &(xo[2]));
                         STN2(&(xo[12]), TP, TT, ovs);
                         TU = VADD(TD, TE);
                         STM2(&(xo[2]), TU, ovs, &(xo[2]));
                         STN2(&(xo[0]), TQ, TU, ovs);
                    }
                    {
                         V TK, TM, TH, TL, TF, TG;
                         TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
                         TM = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
                         TF = VADD(Tp, Tq);
                         TG = VADD(Tz, TA);
                         TH = VSUB(TF, TG);
                         TL = VADD(TF, TG);
                         {
                              V TV, TW, TX, TY;
                              TV = VSUB(TH, TK);
                              STM2(&(xo[20]), TV, ovs, &(xo[0]));
                              STN2(&(xo[20]), TV, TS, ovs);
                              TW = VADD(TL, TM);
                              STM2(&(xo[8]), TW, ovs, &(xo[0]));
                              STN2(&(xo[8]), TW, TR, ovs);
                              TX = VADD(TH, TK);
                              STM2(&(xo[4]), TX, ovs, &(xo[0]));
                              STN2(&(xo[4]), TX, TO, ovs);
                              TY = VSUB(TL, TM);
                              STM2(&(xo[16]), TY, ovs, &(xo[0]));
                              STN2(&(xo[16]), TY, TN, ovs);
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 12, XSIMD_STRING("n2fv_12"), {44, 4, 4, 0}, &GENUS, 0, 2, 0, 0 };

void XSIMD(codelet_n2fv_12) (planner *p) {
     X(kdft_register) (p, n2fv_12, &desc);
}

#endif /* HAVE_FMA */
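
/*
 * Note: an illustrative usage sketch, not part of the generated codelet.
 * Callers never invoke n2fv_12 directly; the planner may select it
 * (depending on the build configuration and its own measurements) when a
 * length-12 complex forward DFT is requested through the public API:
 *
 *     #include <fftw3.h>
 *
 *     int main(void)
 *     {
 *          fftw_complex *in = fftw_malloc(sizeof(fftw_complex) * 12);
 *          fftw_complex *out = fftw_malloc(sizeof(fftw_complex) * 12);
 *          fftw_plan p = fftw_plan_dft_1d(12, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
 *          // ... fill in[0..11] with input samples, then:
 *          fftw_execute(p);
 *          fftw_destroy_plan(p);
 *          fftw_free(in);
 *          fftw_free(out);
 *          return 0;
 *     }
 */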