annotate src/fftw-3.3.8/dft/simd/common/n1fv_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:52 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n1fv_16 -include dft/simd/n1f.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 72 FP additions, 34 FP multiplications,
Chris@82 32 * (or, 38 additions, 0 multiplications, 34 fused multiply/add),
Chris@82 33 * 30 stack variables, 3 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n1f.h"
Chris@82 36
Chris@82 37 static void n1fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) /* Size-16 SIMD codelet, FMA variant: each loop pass loads 16 vectors from xi[WS(is, 0..15)] and stores 16 vectors to xo[WS(os, 0..15)]. ii and io are never referenced in this body. */
Chris@82 38 {
Chris@82 39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626); /* numerically cos(pi/8) */
Chris@82 40 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically sqrt(2)/2 */
Chris@82 41 DVK(KP414213562, +0.414213562373095048801688724209698078569671875); /* numerically sqrt(2) - 1, i.e. tan(pi/8) */
Chris@82 42 {
Chris@82 43 INT i;
Chris@82 44 const R *xi;
Chris@82 45 R *xo;
Chris@82 46 xi = ri; /* all loads below go through xi, which starts at ri */
Chris@82 47 xo = ro; /* all stores below go through xo, which starts at ro */
Chris@82 48 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) { /* i counts down from v by VL per pass; xi/xo advance by VL*ivs / VL*ovs. NOTE(review): MAKE_VOLATILE_STRIDE is defined outside this file -- confirm its effect there. */
Chris@82 49 V T7, TU, Tz, TH, Tu, TV, TA, TK, Te, TX, TC, TO, Tl, TY, TD; /* intermediates carried from the load stages to the store stages */
Chris@82 50 V TR;
Chris@82 51 { /* load stage: inputs 0, 8, 4, 12 */
Chris@82 52 V T1, T2, T3, T4, T5, T6;
Chris@82 53 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 54 T2 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 55 T3 = VADD(T1, T2);
Chris@82 56 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 57 T5 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 58 T6 = VADD(T4, T5);
Chris@82 59 T7 = VSUB(T3, T6);
Chris@82 60 TU = VSUB(T4, T5);
Chris@82 61 Tz = VADD(T3, T6); /* Tz = in0 + in8 + in4 + in12 */
Chris@82 62 TH = VSUB(T1, T2);
Chris@82 63 }
Chris@82 64 { /* load stage: inputs 14, 6, 2, 10 */
Chris@82 65 V Tq, TJ, Tt, TI;
Chris@82 66 {
Chris@82 67 V To, Tp, Tr, Ts;
Chris@82 68 To = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 69 Tp = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 70 Tq = VADD(To, Tp);
Chris@82 71 TJ = VSUB(To, Tp);
Chris@82 72 Tr = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 73 Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 74 Tt = VADD(Tr, Ts);
Chris@82 75 TI = VSUB(Tr, Ts);
Chris@82 76 }
Chris@82 77 Tu = VSUB(Tq, Tt);
Chris@82 78 TV = VSUB(TJ, TI);
Chris@82 79 TA = VADD(Tt, Tq); /* TA = in2 + in10 + in14 + in6 */
Chris@82 80 TK = VADD(TI, TJ);
Chris@82 81 }
Chris@82 82 { /* load stage: inputs 1, 9, 5, 13 */
Chris@82 83 V Ta, TM, Td, TN;
Chris@82 84 {
Chris@82 85 V T8, T9, Tb, Tc;
Chris@82 86 T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 87 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 88 Ta = VADD(T8, T9);
Chris@82 89 TM = VSUB(T8, T9);
Chris@82 90 Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 91 Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 92 Td = VADD(Tb, Tc);
Chris@82 93 TN = VSUB(Tb, Tc);
Chris@82 94 }
Chris@82 95 Te = VSUB(Ta, Td);
Chris@82 96 TX = VFMA(LDK(KP414213562), TM, TN);
Chris@82 97 TC = VADD(Ta, Td); /* TC = in1 + in9 + in5 + in13 */
Chris@82 98 TO = VFNMS(LDK(KP414213562), TN, TM);
Chris@82 99 }
Chris@82 100 { /* load stage: inputs 15, 7, 3, 11 */
Chris@82 101 V Th, TP, Tk, TQ;
Chris@82 102 {
Chris@82 103 V Tf, Tg, Ti, Tj;
Chris@82 104 Tf = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 105 Tg = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 106 Th = VADD(Tf, Tg);
Chris@82 107 TP = VSUB(Tf, Tg);
Chris@82 108 Ti = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 109 Tj = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 110 Tk = VADD(Ti, Tj);
Chris@82 111 TQ = VSUB(Tj, Ti); /* note the operand order: in11 - in3 */
Chris@82 112 }
Chris@82 113 Tl = VSUB(Th, Tk);
Chris@82 114 TY = VFMA(LDK(KP414213562), TP, TQ);
Chris@82 115 TD = VADD(Th, Tk); /* TD = in15 + in7 + in3 + in11 */
Chris@82 116 TR = VFNMS(LDK(KP414213562), TQ, TP);
Chris@82 117 }
Chris@82 118 { /* store stage: outputs 8, 0, 12, 4 */
Chris@82 119 V TB, TE, TF, TG;
Chris@82 120 TB = VADD(Tz, TA); /* sum of the eight even-indexed inputs */
Chris@82 121 TE = VADD(TC, TD); /* sum of the eight odd-indexed inputs */
Chris@82 122 ST(&(xo[WS(os, 8)]), VSUB(TB, TE), ovs, &(xo[0]));
Chris@82 123 ST(&(xo[0]), VADD(TB, TE), ovs, &(xo[0])); /* output 0 is the sum of all 16 inputs */
Chris@82 124 TF = VSUB(Tz, TA);
Chris@82 125 TG = VSUB(TD, TC);
Chris@82 126 ST(&(xo[WS(os, 12)]), VFNMSI(TG, TF), ovs, &(xo[0]));
Chris@82 127 ST(&(xo[WS(os, 4)]), VFMAI(TG, TF), ovs, &(xo[0]));
Chris@82 128 }
Chris@82 129 { /* store stage: outputs 6, 2, 10, 14 */
Chris@82 130 V Tn, Tx, Tw, Ty, Tm, Tv;
Chris@82 131 Tm = VADD(Te, Tl);
Chris@82 132 Tn = VFNMS(LDK(KP707106781), Tm, T7);
Chris@82 133 Tx = VFMA(LDK(KP707106781), Tm, T7);
Chris@82 134 Tv = VSUB(Tl, Te);
Chris@82 135 Tw = VFNMS(LDK(KP707106781), Tv, Tu);
Chris@82 136 Ty = VFMA(LDK(KP707106781), Tv, Tu);
Chris@82 137 ST(&(xo[WS(os, 6)]), VFNMSI(Tw, Tn), ovs, &(xo[0]));
Chris@82 138 ST(&(xo[WS(os, 2)]), VFMAI(Ty, Tx), ovs, &(xo[0]));
Chris@82 139 ST(&(xo[WS(os, 10)]), VFMAI(Tw, Tn), ovs, &(xo[0]));
Chris@82 140 ST(&(xo[WS(os, 14)]), VFNMSI(Ty, Tx), ovs, &(xo[0]));
Chris@82 141 }
Chris@82 142 { /* store stage: outputs 9, 15, 7, 1 */
Chris@82 143 V TT, T11, T10, T12;
Chris@82 144 {
Chris@82 145 V TL, TS, TW, TZ;
Chris@82 146 TL = VFMA(LDK(KP707106781), TK, TH);
Chris@82 147 TS = VADD(TO, TR);
Chris@82 148 TT = VFNMS(LDK(KP923879532), TS, TL);
Chris@82 149 T11 = VFMA(LDK(KP923879532), TS, TL);
Chris@82 150 TW = VFNMS(LDK(KP707106781), TV, TU);
Chris@82 151 TZ = VSUB(TX, TY);
Chris@82 152 T10 = VFNMS(LDK(KP923879532), TZ, TW);
Chris@82 153 T12 = VFMA(LDK(KP923879532), TZ, TW);
Chris@82 154 }
Chris@82 155 ST(&(xo[WS(os, 9)]), VFNMSI(T10, TT), ovs, &(xo[WS(os, 1)]));
Chris@82 156 ST(&(xo[WS(os, 15)]), VFMAI(T12, T11), ovs, &(xo[WS(os, 1)]));
Chris@82 157 ST(&(xo[WS(os, 7)]), VFMAI(T10, TT), ovs, &(xo[WS(os, 1)]));
Chris@82 158 ST(&(xo[WS(os, 1)]), VFNMSI(T12, T11), ovs, &(xo[WS(os, 1)]));
Chris@82 159 }
Chris@82 160 { /* store stage: outputs 5, 13, 11, 3 */
Chris@82 161 V T15, T19, T18, T1a;
Chris@82 162 {
Chris@82 163 V T13, T14, T16, T17;
Chris@82 164 T13 = VFNMS(LDK(KP707106781), TK, TH);
Chris@82 165 T14 = VADD(TX, TY);
Chris@82 166 T15 = VFNMS(LDK(KP923879532), T14, T13);
Chris@82 167 T19 = VFMA(LDK(KP923879532), T14, T13);
Chris@82 168 T16 = VFMA(LDK(KP707106781), TV, TU);
Chris@82 169 T17 = VSUB(TR, TO);
Chris@82 170 T18 = VFNMS(LDK(KP923879532), T17, T16);
Chris@82 171 T1a = VFMA(LDK(KP923879532), T17, T16);
Chris@82 172 }
Chris@82 173 ST(&(xo[WS(os, 5)]), VFNMSI(T18, T15), ovs, &(xo[WS(os, 1)]));
Chris@82 174 ST(&(xo[WS(os, 13)]), VFNMSI(T1a, T19), ovs, &(xo[WS(os, 1)]));
Chris@82 175 ST(&(xo[WS(os, 11)]), VFMAI(T18, T15), ovs, &(xo[WS(os, 1)]));
Chris@82 176 ST(&(xo[WS(os, 3)]), VFMAI(T1a, T19), ovs, &(xo[WS(os, 1)]));
Chris@82 177 }
Chris@82 178 }
Chris@82 179 }
Chris@82 180 VLEAVE(); /* invoked once after the loop; macro is defined outside this file */
Chris@82 181 }
Chris@82 182
Chris@82 183 static const kdft_desc desc = { 16, XSIMD_STRING("n1fv_16"), {38, 0, 34, 0}, &GENUS, 0, 0, 0, 0 }; /* Descriptor for the FMA variant: 16, the codelet name string, then {38, 0, 34, 0}, whose first three entries equal the generated header's tally of 38 additions, 0 multiplications, 34 fused multiply/adds. NOTE(review): meaning of the fourth count and of the trailing zeros is set by kdft_desc, declared elsewhere -- confirm there. */
Chris@82 184
Chris@82 185 void XSIMD(codelet_n1fv_16) (planner *p) { /* Registration entry point (FMA variant); the exported symbol name is produced by the XSIMD macro. */
Chris@82 186 X(kdft_register) (p, n1fv_16, &desc); /* hands the static n1fv_16 function and its descriptor to planner p */
Chris@82 187 }
Chris@82 188
Chris@82 189 #else
Chris@82 190
Chris@82 191 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n1fv_16 -include dft/simd/n1f.h */
Chris@82 192
Chris@82 193 /*
Chris@82 194 * This function contains 72 FP additions, 12 FP multiplications,
Chris@82 195 * (or, 68 additions, 8 multiplications, 4 fused multiply/add),
Chris@82 196 * 30 stack variables, 3 constants, and 32 memory accesses
Chris@82 197 */
Chris@82 198 #include "dft/simd/n1f.h"
Chris@82 199
Chris@82 200 static void n1fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) /* Size-16 SIMD codelet, non-FMA variant (compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined): each loop pass loads 16 vectors from xi[WS(is, 0..15)] and stores 16 vectors to xo[WS(os, 0..15)]. ii and io are never referenced in this body. */
Chris@82 201 {
Chris@82 202 DVK(KP923879532, +0.923879532511286756128183189396788286822416626); /* numerically cos(pi/8) */
Chris@82 203 DVK(KP382683432, +0.382683432365089771728459984030398866761344562); /* numerically sin(pi/8) */
Chris@82 204 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically sqrt(2)/2 */
Chris@82 205 {
Chris@82 206 INT i;
Chris@82 207 const R *xi;
Chris@82 208 R *xo;
Chris@82 209 xi = ri; /* all loads below go through xi, which starts at ri */
Chris@82 210 xo = ro; /* all stores below go through xo, which starts at ro */
Chris@82 211 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) { /* i counts down from v by VL per pass; xi/xo advance by VL*ivs / VL*ovs. NOTE(review): MAKE_VOLATILE_STRIDE is defined outside this file -- confirm its effect there. */
Chris@82 212 V Tp, T13, Tu, TN, Tm, T14, Tv, TY, T7, T17, Ty, TT, Te, T16, Tx; /* intermediates carried from the load stages to the store stages */
Chris@82 213 V TQ;
Chris@82 214 { /* load stage: inputs 4, 12, 0, 8 */
Chris@82 215 V Tn, To, TM, Ts, Tt, TL;
Chris@82 216 Tn = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 217 To = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 218 TM = VADD(Tn, To);
Chris@82 219 Ts = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 220 Tt = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 221 TL = VADD(Ts, Tt);
Chris@82 222 Tp = VSUB(Tn, To);
Chris@82 223 T13 = VADD(TL, TM); /* T13 = in0 + in8 + in4 + in12 */
Chris@82 224 Tu = VSUB(Ts, Tt);
Chris@82 225 TN = VSUB(TL, TM);
Chris@82 226 }
Chris@82 227 { /* load stage: inputs 14, 6, 2, 10 */
Chris@82 228 V Ti, TW, Tl, TX;
Chris@82 229 {
Chris@82 230 V Tg, Th, Tj, Tk;
Chris@82 231 Tg = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 232 Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 233 Ti = VSUB(Tg, Th);
Chris@82 234 TW = VADD(Tg, Th);
Chris@82 235 Tj = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 236 Tk = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 237 Tl = VSUB(Tj, Tk);
Chris@82 238 TX = VADD(Tj, Tk);
Chris@82 239 }
Chris@82 240 Tm = VMUL(LDK(KP707106781), VSUB(Ti, Tl));
Chris@82 241 T14 = VADD(TX, TW); /* T14 = in2 + in10 + in14 + in6 */
Chris@82 242 Tv = VMUL(LDK(KP707106781), VADD(Tl, Ti));
Chris@82 243 TY = VSUB(TW, TX);
Chris@82 244 }
Chris@82 245 { /* load stage: inputs 15, 7, 3, 11 */
Chris@82 246 V T3, TR, T6, TS;
Chris@82 247 {
Chris@82 248 V T1, T2, T4, T5;
Chris@82 249 T1 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 250 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 251 T3 = VSUB(T1, T2);
Chris@82 252 TR = VADD(T1, T2);
Chris@82 253 T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 254 T5 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 255 T6 = VSUB(T4, T5);
Chris@82 256 TS = VADD(T4, T5);
Chris@82 257 }
Chris@82 258 T7 = VFNMS(LDK(KP923879532), T6, VMUL(LDK(KP382683432), T3));
Chris@82 259 T17 = VADD(TR, TS); /* T17 = in15 + in7 + in3 + in11 */
Chris@82 260 Ty = VFMA(LDK(KP923879532), T3, VMUL(LDK(KP382683432), T6));
Chris@82 261 TT = VSUB(TR, TS);
Chris@82 262 }
Chris@82 263 { /* load stage: inputs 1, 9, 5, 13 */
Chris@82 264 V Ta, TO, Td, TP;
Chris@82 265 {
Chris@82 266 V T8, T9, Tb, Tc;
Chris@82 267 T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 268 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 269 Ta = VSUB(T8, T9);
Chris@82 270 TO = VADD(T8, T9);
Chris@82 271 Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 272 Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 273 Td = VSUB(Tb, Tc);
Chris@82 274 TP = VADD(Tb, Tc);
Chris@82 275 }
Chris@82 276 Te = VFMA(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), Td));
Chris@82 277 T16 = VADD(TO, TP); /* T16 = in1 + in9 + in5 + in13 */
Chris@82 278 Tx = VFNMS(LDK(KP382683432), Td, VMUL(LDK(KP923879532), Ta));
Chris@82 279 TQ = VSUB(TO, TP);
Chris@82 280 }
Chris@82 281 { /* store stage: outputs 8, 0, 12, 4 */
Chris@82 282 V T15, T18, T19, T1a;
Chris@82 283 T15 = VADD(T13, T14); /* sum of the eight even-indexed inputs */
Chris@82 284 T18 = VADD(T16, T17); /* sum of the eight odd-indexed inputs */
Chris@82 285 ST(&(xo[WS(os, 8)]), VSUB(T15, T18), ovs, &(xo[0]));
Chris@82 286 ST(&(xo[0]), VADD(T15, T18), ovs, &(xo[0])); /* output 0 is the sum of all 16 inputs */
Chris@82 287 T19 = VSUB(T13, T14);
Chris@82 288 T1a = VBYI(VSUB(T17, T16));
Chris@82 289 ST(&(xo[WS(os, 12)]), VSUB(T19, T1a), ovs, &(xo[0]));
Chris@82 290 ST(&(xo[WS(os, 4)]), VADD(T19, T1a), ovs, &(xo[0]));
Chris@82 291 }
Chris@82 292 { /* store stage: outputs 14, 6, 2, 10 */
Chris@82 293 V TV, T11, T10, T12, TU, TZ;
Chris@82 294 TU = VMUL(LDK(KP707106781), VADD(TQ, TT));
Chris@82 295 TV = VADD(TN, TU);
Chris@82 296 T11 = VSUB(TN, TU);
Chris@82 297 TZ = VMUL(LDK(KP707106781), VSUB(TT, TQ));
Chris@82 298 T10 = VBYI(VADD(TY, TZ));
Chris@82 299 T12 = VBYI(VSUB(TZ, TY));
Chris@82 300 ST(&(xo[WS(os, 14)]), VSUB(TV, T10), ovs, &(xo[0]));
Chris@82 301 ST(&(xo[WS(os, 6)]), VADD(T11, T12), ovs, &(xo[0]));
Chris@82 302 ST(&(xo[WS(os, 2)]), VADD(TV, T10), ovs, &(xo[0]));
Chris@82 303 ST(&(xo[WS(os, 10)]), VSUB(T11, T12), ovs, &(xo[0]));
Chris@82 304 }
Chris@82 305 { /* store stage: outputs 7, 15, 9, 1 */
Chris@82 306 V Tr, TB, TA, TC;
Chris@82 307 {
Chris@82 308 V Tf, Tq, Tw, Tz;
Chris@82 309 Tf = VSUB(T7, Te);
Chris@82 310 Tq = VSUB(Tm, Tp);
Chris@82 311 Tr = VBYI(VSUB(Tf, Tq));
Chris@82 312 TB = VBYI(VADD(Tq, Tf));
Chris@82 313 Tw = VADD(Tu, Tv);
Chris@82 314 Tz = VADD(Tx, Ty);
Chris@82 315 TA = VSUB(Tw, Tz);
Chris@82 316 TC = VADD(Tw, Tz);
Chris@82 317 }
Chris@82 318 ST(&(xo[WS(os, 7)]), VADD(Tr, TA), ovs, &(xo[WS(os, 1)]));
Chris@82 319 ST(&(xo[WS(os, 15)]), VSUB(TC, TB), ovs, &(xo[WS(os, 1)]));
Chris@82 320 ST(&(xo[WS(os, 9)]), VSUB(TA, Tr), ovs, &(xo[WS(os, 1)]));
Chris@82 321 ST(&(xo[WS(os, 1)]), VADD(TB, TC), ovs, &(xo[WS(os, 1)]));
Chris@82 322 }
Chris@82 323 { /* store stage: outputs 13, 5, 3, 11 */
Chris@82 324 V TF, TJ, TI, TK;
Chris@82 325 {
Chris@82 326 V TD, TE, TG, TH;
Chris@82 327 TD = VSUB(Tu, Tv);
Chris@82 328 TE = VADD(Te, T7);
Chris@82 329 TF = VADD(TD, TE);
Chris@82 330 TJ = VSUB(TD, TE);
Chris@82 331 TG = VADD(Tp, Tm);
Chris@82 332 TH = VSUB(Ty, Tx);
Chris@82 333 TI = VBYI(VADD(TG, TH));
Chris@82 334 TK = VBYI(VSUB(TH, TG));
Chris@82 335 }
Chris@82 336 ST(&(xo[WS(os, 13)]), VSUB(TF, TI), ovs, &(xo[WS(os, 1)]));
Chris@82 337 ST(&(xo[WS(os, 5)]), VADD(TJ, TK), ovs, &(xo[WS(os, 1)]));
Chris@82 338 ST(&(xo[WS(os, 3)]), VADD(TF, TI), ovs, &(xo[WS(os, 1)]));
Chris@82 339 ST(&(xo[WS(os, 11)]), VSUB(TJ, TK), ovs, &(xo[WS(os, 1)]));
Chris@82 340 }
Chris@82 341 }
Chris@82 342 }
Chris@82 343 VLEAVE(); /* invoked once after the loop; macro is defined outside this file */
Chris@82 344 }
Chris@82 345
Chris@82 346 static const kdft_desc desc = { 16, XSIMD_STRING("n1fv_16"), {68, 8, 4, 0}, &GENUS, 0, 0, 0, 0 }; /* Descriptor for the non-FMA variant: 16, the codelet name string, then {68, 8, 4, 0}, whose first three entries equal the generated header's tally of 68 additions, 8 multiplications, 4 fused multiply/adds. NOTE(review): meaning of the fourth count and of the trailing zeros is set by kdft_desc, declared elsewhere -- confirm there. */
Chris@82 347
Chris@82 348 void XSIMD(codelet_n1fv_16) (planner *p) { /* Registration entry point (non-FMA variant); the exported symbol name is produced by the XSIMD macro. */
Chris@82 349 X(kdft_register) (p, n1fv_16, &desc); /* hands the static n1fv_16 function and its descriptor to planner p */
Chris@82 350 }
Chris@82 351
Chris@82 352 #endif