annotate src/fftw-3.3.5/dft/simd/common/n2bv_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:40:35 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 12 -name n2bv_12 -with-ostride 2 -include n2b.h -store-multiple 2 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 48 FP additions, 20 FP multiplications,
Chris@42 32 * (or, 30 additions, 2 multiplications, 18 fused multiply/add),
Chris@42 33 * 61 stack variables, 2 constants, and 30 memory accesses
Chris@42 34 */
Chris@42 35 #include "n2b.h"
Chris@42 36
Chris@42 37 static void n2bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) /* HAVE_FMA variant of the size-12 codelet (generator flags: -sign 1 -n 12 -with-ostride 2 -store-multiple 2). Reads only through ii and writes only through io; ri and ro are never referenced in the body. */
Chris@42 38 {
Chris@42 39 DVK(KP866025403, +0.866025403784438646763723170752936183471402627); /* numerically sqrt(3)/2 */
Chris@42 40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
Chris@42 41 {
Chris@42 42 INT i;
Chris@42 43 const R *xi;
Chris@42 44 R *xo;
Chris@42 45 xi = ii; /* all loads below go through xi, i.e. the ii pointer */
Chris@42 46 xo = io; /* all stores below go through xo, i.e. the io pointer */
Chris@42 47 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) { /* counter starts at v and drops by VL per pass; xi advances by VL*ivs and xo by VL*ovs each pass */
Chris@42 48 V T1, T6, Tc, Th, Td, Te, Ti, Tz, T4, TA, T9, Tj, Tf, Tw;
Chris@42 49 {
Chris@42 50 V T2, T3, T7, T8;
Chris@42 51 T1 = LD(&(xi[0]), ivs, &(xi[0])); /* this block loads the 12 inputs xi[WS(is, k)], k = 0..11: even k first, then odd k */
Chris@42 52 T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 53 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 54 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 55 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 56 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 57 Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 58 Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 59 Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 60 Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 61 Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 62 Tz = VSUB(T2, T3);
Chris@42 63 T4 = VADD(T2, T3);
Chris@42 64 TA = VSUB(T7, T8);
Chris@42 65 T9 = VADD(T7, T8);
Chris@42 66 Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 67 }
Chris@42 68 Tf = VADD(Td, Te);
Chris@42 69 Tw = VSUB(Td, Te);
Chris@42 70 {
Chris@42 71 V T5, Tp, TJ, TB, Ta, Tq, Tk, Tx, Tg, Ts;
Chris@42 72 T5 = VADD(T1, T4);
Chris@42 73 Tp = VFNMS(LDK(KP500000000), T4, T1); /* VFNMS is defined outside this file; NOTE(review): presumably T1 - 0.5*T4 -- confirm against the SIMD header */
Chris@42 74 TJ = VSUB(Tz, TA);
Chris@42 75 TB = VADD(Tz, TA);
Chris@42 76 Ta = VADD(T6, T9);
Chris@42 77 Tq = VFNMS(LDK(KP500000000), T9, T6);
Chris@42 78 Tk = VADD(Ti, Tj);
Chris@42 79 Tx = VSUB(Tj, Ti);
Chris@42 80 Tg = VADD(Tc, Tf);
Chris@42 81 Ts = VFNMS(LDK(KP500000000), Tf, Tc);
Chris@42 82 {
Chris@42 83 V Tr, TF, Tb, Tn, TG, Ty, Tl, Tt;
Chris@42 84 Tr = VADD(Tp, Tq);
Chris@42 85 TF = VSUB(Tp, Tq);
Chris@42 86 Tb = VSUB(T5, Ta);
Chris@42 87 Tn = VADD(T5, Ta);
Chris@42 88 TG = VADD(Tw, Tx);
Chris@42 89 Ty = VSUB(Tw, Tx);
Chris@42 90 Tl = VADD(Th, Tk);
Chris@42 91 Tt = VFNMS(LDK(KP500000000), Tk, Th);
Chris@42 92 {
Chris@42 93 V TC, TE, TH, TL, Tu, TI, Tm, To;
Chris@42 94 TC = VMUL(LDK(KP866025403), VSUB(Ty, TB));
Chris@42 95 TE = VMUL(LDK(KP866025403), VADD(TB, Ty));
Chris@42 96 TH = VFNMS(LDK(KP866025403), TG, TF);
Chris@42 97 TL = VFMA(LDK(KP866025403), TG, TF);
Chris@42 98 Tu = VADD(Ts, Tt);
Chris@42 99 TI = VSUB(Ts, Tt);
Chris@42 100 Tm = VSUB(Tg, Tl);
Chris@42 101 To = VADD(Tg, Tl);
Chris@42 102 {
Chris@42 103 V TK, TM, Tv, TD;
Chris@42 104 TK = VFMA(LDK(KP866025403), TJ, TI);
Chris@42 105 TM = VFNMS(LDK(KP866025403), TJ, TI);
Chris@42 106 Tv = VSUB(Tr, Tu);
Chris@42 107 TD = VADD(Tr, Tu);
Chris@42 108 {
Chris@42 109 V TN, TO, TP, TQ;
Chris@42 110 TN = VADD(Tn, To);
Chris@42 111 STM2(&(xo[0]), TN, ovs, &(xo[0])); /* results are written at the fixed even offsets xo[0], xo[2], ..., xo[22]; os is never used to index xo */
Chris@42 112 TO = VSUB(Tn, To);
Chris@42 113 STM2(&(xo[12]), TO, ovs, &(xo[0]));
Chris@42 114 TP = VFMAI(Tm, Tb); /* VFMAI / VFNMSI are defined outside this file; NOTE(review): presumably Tb + i*Tm and Tb - i*Tm -- confirm */
Chris@42 115 STM2(&(xo[18]), TP, ovs, &(xo[2]));
Chris@42 116 TQ = VFNMSI(Tm, Tb);
Chris@42 117 STM2(&(xo[6]), TQ, ovs, &(xo[2]));
Chris@42 118 {
Chris@42 119 V TR, TS, TT, TU;
Chris@42 120 TR = VFMAI(TM, TL);
Chris@42 121 STM2(&(xo[10]), TR, ovs, &(xo[2]));
Chris@42 122 TS = VFNMSI(TM, TL);
Chris@42 123 STM2(&(xo[14]), TS, ovs, &(xo[2]));
Chris@42 124 STN2(&(xo[12]), TO, TS, ovs); /* pairs TO (STM2 target xo[12]) with TS (STM2 target xo[14]); each STN2 below likewise pairs the values for offsets 4k and 4k+2. STM2/STN2 are defined elsewhere -- presumably the -store-multiple 2 combined store, confirm */
Chris@42 125 TT = VFNMSI(TK, TH);
Chris@42 126 STM2(&(xo[22]), TT, ovs, &(xo[2]));
Chris@42 127 TU = VFMAI(TK, TH);
Chris@42 128 STM2(&(xo[2]), TU, ovs, &(xo[2]));
Chris@42 129 STN2(&(xo[0]), TN, TU, ovs);
Chris@42 130 {
Chris@42 131 V TV, TW, TX, TY;
Chris@42 132 TV = VFNMSI(TE, TD);
Chris@42 133 STM2(&(xo[16]), TV, ovs, &(xo[0]));
Chris@42 134 STN2(&(xo[16]), TV, TP, ovs);
Chris@42 135 TW = VFMAI(TE, TD);
Chris@42 136 STM2(&(xo[8]), TW, ovs, &(xo[0]));
Chris@42 137 STN2(&(xo[8]), TW, TR, ovs);
Chris@42 138 TX = VFMAI(TC, Tv);
Chris@42 139 STM2(&(xo[4]), TX, ovs, &(xo[0]));
Chris@42 140 STN2(&(xo[4]), TX, TQ, ovs);
Chris@42 141 TY = VFNMSI(TC, Tv);
Chris@42 142 STM2(&(xo[20]), TY, ovs, &(xo[0]));
Chris@42 143 STN2(&(xo[20]), TY, TT, ovs);
Chris@42 144 }
Chris@42 145 }
Chris@42 146 }
Chris@42 147 }
Chris@42 148 }
Chris@42 149 }
Chris@42 150 }
Chris@42 151 }
Chris@42 152 }
Chris@42 153 VLEAVE(); /* macro defined outside this file; NOTE(review): presumably the SIMD epilogue -- confirm */
Chris@42 154 }
Chris@42 155
Chris@42 156 static const kdft_desc desc = { 12, XSIMD_STRING("n2bv_12"), {30, 2, 18, 0}, &GENUS, 0, 2, 0, 0 }; /* descriptor for the HAVE_FMA variant: size 12, name "n2bv_12"; {30, 2, 18, 0} matches the 30 additions / 2 multiplications / 18 fused multiply-adds quoted in this variant's header comment. NOTE(review): the trailing 0, 2, 0, 0 are presumably stride constraints (the 2 lining up with -with-ostride 2) -- confirm against the kdft_desc definition */
Chris@42 157
Chris@42 158 void XSIMD(codelet_n2bv_12) (planner *p) { /* registration entry point: hands n2bv_12 and its descriptor to planner p */
Chris@42 159 X(kdft_register) (p, n2bv_12, &desc);
Chris@42 160 }
Chris@42 161
Chris@42 162 #else /* HAVE_FMA */
Chris@42 163
Chris@42 164 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 12 -name n2bv_12 -with-ostride 2 -include n2b.h -store-multiple 2 */
Chris@42 165
Chris@42 166 /*
Chris@42 167 * This function contains 48 FP additions, 8 FP multiplications,
Chris@42 168 * (or, 44 additions, 4 multiplications, 4 fused multiply/add),
Chris@42 169 * 33 stack variables, 2 constants, and 30 memory accesses
Chris@42 170 */
Chris@42 171 #include "n2b.h"
Chris@42 172
Chris@42 173 static void n2bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) /* non-HAVE_FMA variant of the size-12 codelet (generator flags: -sign 1 -n 12 -with-ostride 2 -store-multiple 2, without -fma). Reads only through ii and writes only through io; ri and ro are never referenced in the body. */
Chris@42 174 {
Chris@42 175 DVK(KP866025403, +0.866025403784438646763723170752936183471402627); /* numerically sqrt(3)/2 */
Chris@42 176 DVK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
Chris@42 177 {
Chris@42 178 INT i;
Chris@42 179 const R *xi;
Chris@42 180 R *xo;
Chris@42 181 xi = ii; /* all loads below go through xi, i.e. the ii pointer */
Chris@42 182 xo = io; /* all stores below go through xo, i.e. the io pointer */
Chris@42 183 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) { /* counter starts at v and drops by VL per pass; xi advances by VL*ivs and xo by VL*ovs each pass */
Chris@42 184 V T5, Ta, TG, TF, Ty, Tm, Ti, Tp, TJ, TI, Tx, Ts;
Chris@42 185 { /* first stage: the six even-indexed inputs xi[WS(is, 0/2/4/6/8/10)] */
Chris@42 186 V T1, T6, T4, Tk, T9, Tl;
Chris@42 187 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 188 T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 189 {
Chris@42 190 V T2, T3, T7, T8;
Chris@42 191 T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 192 T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 193 T4 = VADD(T2, T3);
Chris@42 194 Tk = VSUB(T2, T3);
Chris@42 195 T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 196 T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 197 T9 = VADD(T7, T8);
Chris@42 198 Tl = VSUB(T7, T8);
Chris@42 199 }
Chris@42 200 T5 = VFNMS(LDK(KP500000000), T4, T1); /* one of four VFNMS calls in this variant (the header comment counts 4 fused multiply/add); VFNMS is defined outside this file -- presumably T1 - 0.5*T4, confirm */
Chris@42 201 Ta = VFNMS(LDK(KP500000000), T9, T6);
Chris@42 202 TG = VADD(T6, T9);
Chris@42 203 TF = VADD(T1, T4);
Chris@42 204 Ty = VADD(Tk, Tl);
Chris@42 205 Tm = VMUL(LDK(KP866025403), VSUB(Tk, Tl));
Chris@42 206 }
Chris@42 207 { /* second stage: the six odd-indexed inputs xi[WS(is, 1/3/5/7/9/11)] */
Chris@42 208 V Tn, Tq, Te, To, Th, Tr;
Chris@42 209 Tn = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 210 Tq = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 211 {
Chris@42 212 V Tc, Td, Tf, Tg;
Chris@42 213 Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 214 Td = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 215 Te = VSUB(Tc, Td);
Chris@42 216 To = VADD(Tc, Td);
Chris@42 217 Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 218 Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 219 Th = VSUB(Tf, Tg);
Chris@42 220 Tr = VADD(Tf, Tg);
Chris@42 221 }
Chris@42 222 Ti = VMUL(LDK(KP866025403), VSUB(Te, Th));
Chris@42 223 Tp = VFNMS(LDK(KP500000000), To, Tn);
Chris@42 224 TJ = VADD(Tq, Tr);
Chris@42 225 TI = VADD(Tn, To);
Chris@42 226 Tx = VADD(Te, Th);
Chris@42 227 Ts = VFNMS(LDK(KP500000000), Tr, Tq);
Chris@42 228 }
Chris@42 229 { /* final stage: combine the two halves and write the 12 results at the fixed even offsets xo[0], xo[2], ..., xo[22]; os is never used to index xo */
Chris@42 230 V TN, TO, TP, TQ, TR, TS;
Chris@42 231 {
Chris@42 232 V TH, TK, TL, TM;
Chris@42 233 TH = VSUB(TF, TG);
Chris@42 234 TK = VBYI(VSUB(TI, TJ)); /* VBYI is defined outside this file; NOTE(review): presumably multiplication by the imaginary unit -- confirm */
Chris@42 235 TN = VSUB(TH, TK);
Chris@42 236 STM2(&(xo[6]), TN, ovs, &(xo[2]));
Chris@42 237 TO = VADD(TH, TK);
Chris@42 238 STM2(&(xo[18]), TO, ovs, &(xo[2]));
Chris@42 239 TL = VADD(TF, TG);
Chris@42 240 TM = VADD(TI, TJ);
Chris@42 241 TP = VSUB(TL, TM);
Chris@42 242 STM2(&(xo[12]), TP, ovs, &(xo[0]));
Chris@42 243 TQ = VADD(TL, TM);
Chris@42 244 STM2(&(xo[0]), TQ, ovs, &(xo[0]));
Chris@42 245 }
Chris@42 246 {
Chris@42 247 V Tj, Tv, Tu, Tw, Tb, Tt, TT, TU;
Chris@42 248 Tb = VSUB(T5, Ta);
Chris@42 249 Tj = VSUB(Tb, Ti);
Chris@42 250 Tv = VADD(Tb, Ti);
Chris@42 251 Tt = VSUB(Tp, Ts);
Chris@42 252 Tu = VBYI(VADD(Tm, Tt));
Chris@42 253 Tw = VBYI(VSUB(Tt, Tm));
Chris@42 254 TR = VSUB(Tj, Tu);
Chris@42 255 STM2(&(xo[22]), TR, ovs, &(xo[2]));
Chris@42 256 TS = VADD(Tv, Tw);
Chris@42 257 STM2(&(xo[10]), TS, ovs, &(xo[2]));
Chris@42 258 TT = VADD(Tj, Tu);
Chris@42 259 STM2(&(xo[2]), TT, ovs, &(xo[2]));
Chris@42 260 STN2(&(xo[0]), TQ, TT, ovs); /* pairs TQ (STM2 target xo[0]) with TT (STM2 target xo[2]); each STN2 below likewise pairs the values for offsets 4k and 4k+2. STM2/STN2 are defined elsewhere -- presumably the -store-multiple 2 combined store, confirm */
Chris@42 261 TU = VSUB(Tv, Tw);
Chris@42 262 STM2(&(xo[14]), TU, ovs, &(xo[2]));
Chris@42 263 STN2(&(xo[12]), TP, TU, ovs);
Chris@42 264 }
Chris@42 265 {
Chris@42 266 V Tz, TD, TC, TE, TA, TB;
Chris@42 267 Tz = VBYI(VMUL(LDK(KP866025403), VSUB(Tx, Ty)));
Chris@42 268 TD = VBYI(VMUL(LDK(KP866025403), VADD(Ty, Tx)));
Chris@42 269 TA = VADD(T5, Ta);
Chris@42 270 TB = VADD(Tp, Ts);
Chris@42 271 TC = VSUB(TA, TB);
Chris@42 272 TE = VADD(TA, TB);
Chris@42 273 {
Chris@42 274 V TV, TW, TX, TY;
Chris@42 275 TV = VADD(Tz, TC);
Chris@42 276 STM2(&(xo[4]), TV, ovs, &(xo[0]));
Chris@42 277 STN2(&(xo[4]), TV, TN, ovs);
Chris@42 278 TW = VSUB(TE, TD);
Chris@42 279 STM2(&(xo[16]), TW, ovs, &(xo[0]));
Chris@42 280 STN2(&(xo[16]), TW, TO, ovs);
Chris@42 281 TX = VSUB(TC, Tz);
Chris@42 282 STM2(&(xo[20]), TX, ovs, &(xo[0]));
Chris@42 283 STN2(&(xo[20]), TX, TR, ovs);
Chris@42 284 TY = VADD(TD, TE);
Chris@42 285 STM2(&(xo[8]), TY, ovs, &(xo[0]));
Chris@42 286 STN2(&(xo[8]), TY, TS, ovs);
Chris@42 287 }
Chris@42 288 }
Chris@42 289 }
Chris@42 290 }
Chris@42 291 }
Chris@42 292 VLEAVE(); /* macro defined outside this file; NOTE(review): presumably the SIMD epilogue -- confirm */
Chris@42 293 }
Chris@42 294
Chris@42 295 static const kdft_desc desc = { 12, XSIMD_STRING("n2bv_12"), {44, 4, 4, 0}, &GENUS, 0, 2, 0, 0 }; /* descriptor for the non-HAVE_FMA variant: size 12, name "n2bv_12"; {44, 4, 4, 0} matches the 44 additions / 4 multiplications / 4 fused multiply-adds quoted in this variant's header comment. NOTE(review): the trailing 0, 2, 0, 0 are presumably stride constraints (the 2 lining up with -with-ostride 2) -- confirm against the kdft_desc definition */
Chris@42 296
Chris@42 297 void XSIMD(codelet_n2bv_12) (planner *p) { /* registration entry point: hands n2bv_12 and its descriptor to planner p */
Chris@42 298 X(kdft_register) (p, n2bv_12, &desc);
Chris@42 299 }
Chris@42 300
Chris@42 301 #endif /* HAVE_FMA */