src/fftw-3.3.8/dft/simd/common/n2sv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes).

author Chris Cannam
date   Fri, 07 Feb 2020 11:51:13 +0000

/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:05:19 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_notw.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name n2sv_8 -with-ostride 1 -include dft/simd/n2s.h -store-multiple 4 */
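/*
 * Editor's note (added), our reading of the genfft flags above: -n 8 is the
 * transform size, -fma emits the VFMA/VFNMS forms seen below instead of
 * separate multiply and add, -store-multiple 4 groups output stores into the
 * STM4/STN4 pairs, and -with-ostride 1 hard-codes the output stride (which
 * is why ro/io are indexed directly while ri/ii go through WS(is, ...)).
 */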

/*
 * This function contains 52 FP additions, 8 FP multiplications,
 * (or, 44 additions, 0 multiplications, 8 fused multiply/add),
 * 34 stack variables, 1 constant, and 36 memory accesses
 */
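/*
 * Editor's note (added): assuming FFTW's forward sign convention, the
 * codelet computes X[k] = sum_{n=0..7} x[n] * e^(-2*pi*i*n*k/8) for each of
 * the v transforms.  The only twiddle factor with a nontrivial coefficient
 * is e^(-i*pi/4) = (1 - i)/sqrt(2), so the single constant KP707106781
 * below is 1/sqrt(2) = cos(pi/4) = sin(pi/4).
 */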
#include "dft/simd/n2s.h"

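/*
 * Editor's note (added), parameter meanings inferred from FFTW's codelet
 * conventions: ri/ii and ro/io are split real/imaginary input and output
 * arrays; is/os are the strides between elements within one transform;
 * v is the number of independent transforms; ivs/ovs are the strides
 * between successive transforms.
 */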
static void n2sv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
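          /* Editor's note (added): each loop pass computes 2*VL independent
             size-8 transforms at once, where VL is the SIMD vector length;
             all four array pointers advance by 2*VL transforms per pass.
             MAKE_VOLATILE_STRIDE is an FFTW optimizer hint, not arithmetic. */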
          for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
               V T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
               V TG;
               {
                    V T1, T2, Tj, Tk;
                    T1 = LD(&(ri[0]), ivs, &(ri[0]));
                    T2 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
                    T3 = VADD(T1, T2);
                    Tn = VSUB(T1, T2);
                    {
                         V Tg, Th, T4, T5;
                         Tg = LD(&(ii[0]), ivs, &(ii[0]));
                         Th = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
                         Ti = VADD(Tg, Th);
                         TC = VSUB(Tg, Th);
                         T4 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
                         T5 = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
                         T6 = VADD(T4, T5);
                         TB = VSUB(T4, T5);
                    }
                    Tj = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
                    Tk = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
                    Tl = VADD(Tj, Tk);
                    To = VSUB(Tj, Tk);
                    {
                         V Tb, Tc, Tv, Tw, Tx, Ty;
                         Tb = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
                         Tc = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
                         Tv = VSUB(Tb, Tc);
                         Tw = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
                         Tx = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
                         Ty = VSUB(Tw, Tx);
                         Td = VADD(Tb, Tc);
                         TN = VADD(Tw, Tx);
                         Tz = VSUB(Tv, Ty);
                         TH = VADD(Tv, Ty);
                    }
                    {
                         V T8, T9, Tq, Tr, Ts, Tt;
                         T8 = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
                         T9 = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
                         Tq = VSUB(T8, T9);
                         Tr = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
                         Ts = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
                         Tt = VSUB(Tr, Ts);
                         Ta = VADD(T8, T9);
                         TM = VADD(Tr, Ts);
                         Tu = VADD(Tq, Tt);
                         TG = VSUB(Tt, Tq);
                    }
               }
               {
                    V TR, TS, TT, TU, TV, TW, TX, TY;
                    {
                         V T7, Te, TP, TQ;
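                         /* Editor's note (added): STM4/STN4 reflect the
                            -store-multiple 4 option.  Our reading: each STM4
                            issues one output store, and the matching STN4
                            later lets the SIMD layer combine four stores to
                            consecutive indices where the ISA benefits (on
                            some ISAs STN4 expands to nothing). */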
                         T7 = VADD(T3, T6);
                         Te = VADD(Ta, Td);
                         TR = VSUB(T7, Te);
                         STM4(&(ro[4]), TR, ovs, &(ro[0]));
                         TS = VADD(T7, Te);
                         STM4(&(ro[0]), TS, ovs, &(ro[0]));
                         TP = VADD(Ti, Tl);
                         TQ = VADD(TM, TN);
                         TT = VSUB(TP, TQ);
                         STM4(&(io[4]), TT, ovs, &(io[0]));
                         TU = VADD(TP, TQ);
                         STM4(&(io[0]), TU, ovs, &(io[0]));
                    }
                    {
                         V Tf, Tm, TL, TO;
                         Tf = VSUB(Td, Ta);
                         Tm = VSUB(Ti, Tl);
                         TV = VADD(Tf, Tm);
                         STM4(&(io[2]), TV, ovs, &(io[0]));
                         TW = VSUB(Tm, Tf);
                         STM4(&(io[6]), TW, ovs, &(io[0]));
                         TL = VSUB(T3, T6);
                         TO = VSUB(TM, TN);
                         TX = VSUB(TL, TO);
                         STM4(&(ro[6]), TX, ovs, &(ro[0]));
                         TY = VADD(TL, TO);
                         STM4(&(ro[2]), TY, ovs, &(ro[0]));
                    }
                    {
                         V TZ, T10, T11, T12;
                         {
                              V Tp, TA, TJ, TK;
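                              /* Editor's note (added): in FFTW's SIMD layer
                                 VFMA(a, b, c) = a * b + c and
                                 VFNMS(a, b, c) = c - a * b, so TZ below is
                                 Tp - KP707106781 * TA and T10 is
                                 Tp + KP707106781 * TA. */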
                              Tp = VADD(Tn, To);
                              TA = VADD(Tu, Tz);
                              TZ = VFNMS(LDK(KP707106781), TA, Tp);
                              STM4(&(ro[5]), TZ, ovs, &(ro[1]));
                              T10 = VFMA(LDK(KP707106781), TA, Tp);
                              STM4(&(ro[1]), T10, ovs, &(ro[1]));
                              TJ = VSUB(TC, TB);
                              TK = VADD(TG, TH);
                              T11 = VFNMS(LDK(KP707106781), TK, TJ);
                              STM4(&(io[5]), T11, ovs, &(io[1]));
                              T12 = VFMA(LDK(KP707106781), TK, TJ);
                              STM4(&(io[1]), T12, ovs, &(io[1]));
                         }
                         {
                              V TD, TE, T13, T14;
                              TD = VADD(TB, TC);
                              TE = VSUB(Tz, Tu);
                              T13 = VFNMS(LDK(KP707106781), TE, TD);
                              STM4(&(io[7]), T13, ovs, &(io[1]));
                              STN4(&(io[4]), TT, T11, TW, T13, ovs);
                              T14 = VFMA(LDK(KP707106781), TE, TD);
                              STM4(&(io[3]), T14, ovs, &(io[1]));
                              STN4(&(io[0]), TU, T12, TV, T14, ovs);
                         }
                         {
                              V TF, TI, T15, T16;
                              TF = VSUB(Tn, To);
                              TI = VSUB(TG, TH);
                              T15 = VFNMS(LDK(KP707106781), TI, TF);
                              STM4(&(ro[7]), T15, ovs, &(ro[1]));
                              STN4(&(ro[4]), TR, TZ, TX, T15, ovs);
                              T16 = VFMA(LDK(KP707106781), TI, TF);
                              STM4(&(ro[3]), T16, ovs, &(ro[1]));
                              STN4(&(ro[0]), TS, T10, TY, T16, ovs);
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 8, XSIMD_STRING("n2sv_8"), {44, 0, 8, 0}, &GENUS, 0, 1, 0, 0 };
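/* Editor's note (added): our reading of {44, 0, 8, 0} is the operation
   count advertised to the planner (adds, multiplies, fused multiply-adds,
   other), matching the cost comment above the function. */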

void XSIMD(codelet_n2sv_8) (planner *p) {
     X(kdft_register) (p, n2sv_8, &desc);
}

#else

/* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name n2sv_8 -with-ostride 1 -include dft/simd/n2s.h -store-multiple 4 */

/*
 * This function contains 52 FP additions, 4 FP multiplications,
 * (or, 52 additions, 4 multiplications, 0 fused multiply/add),
 * 34 stack variables, 1 constant, and 36 memory accesses
 */
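/*
 * Editor's note (added): this fallback spells each of the FMA variant's
 * fused operations as an explicit VMUL whose result feeds a VSUB/VADD
 * pair, hence 52 additions and 4 multiplications instead of 44 additions
 * and 8 FMAs.
 */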
#include "dft/simd/n2s.h"

static void n2sv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
          for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
               V T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
               V TG;
               {
                    V T1, T2, Tj, Tk;
                    T1 = LD(&(ri[0]), ivs, &(ri[0]));
                    T2 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
                    T3 = VADD(T1, T2);
                    Tn = VSUB(T1, T2);
                    {
                         V Tg, Th, T4, T5;
                         Tg = LD(&(ii[0]), ivs, &(ii[0]));
                         Th = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
                         Ti = VADD(Tg, Th);
                         TC = VSUB(Tg, Th);
                         T4 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
                         T5 = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
                         T6 = VADD(T4, T5);
                         TB = VSUB(T4, T5);
                    }
                    Tj = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
                    Tk = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
                    Tl = VADD(Tj, Tk);
                    To = VSUB(Tj, Tk);
                    {
                         V Tb, Tc, Tv, Tw, Tx, Ty;
                         Tb = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
                         Tc = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
                         Tv = VSUB(Tb, Tc);
                         Tw = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
                         Tx = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
                         Ty = VSUB(Tw, Tx);
                         Td = VADD(Tb, Tc);
                         TN = VADD(Tw, Tx);
                         Tz = VSUB(Tv, Ty);
                         TH = VADD(Tv, Ty);
                    }
                    {
                         V T8, T9, Tq, Tr, Ts, Tt;
                         T8 = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
                         T9 = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
                         Tq = VSUB(T8, T9);
                         Tr = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
                         Ts = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
                         Tt = VSUB(Tr, Ts);
                         Ta = VADD(T8, T9);
                         TM = VADD(Tr, Ts);
                         Tu = VADD(Tq, Tt);
                         TG = VSUB(Tt, Tq);
                    }
               }
               {
                    V TR, TS, TT, TU, TV, TW, TX, TY;
                    {
                         V T7, Te, TP, TQ;
                         T7 = VADD(T3, T6);
                         Te = VADD(Ta, Td);
                         TR = VSUB(T7, Te);
                         STM4(&(ro[4]), TR, ovs, &(ro[0]));
                         TS = VADD(T7, Te);
                         STM4(&(ro[0]), TS, ovs, &(ro[0]));
                         TP = VADD(Ti, Tl);
                         TQ = VADD(TM, TN);
                         TT = VSUB(TP, TQ);
                         STM4(&(io[4]), TT, ovs, &(io[0]));
                         TU = VADD(TP, TQ);
                         STM4(&(io[0]), TU, ovs, &(io[0]));
                    }
                    {
                         V Tf, Tm, TL, TO;
                         Tf = VSUB(Td, Ta);
                         Tm = VSUB(Ti, Tl);
                         TV = VADD(Tf, Tm);
                         STM4(&(io[2]), TV, ovs, &(io[0]));
                         TW = VSUB(Tm, Tf);
                         STM4(&(io[6]), TW, ovs, &(io[0]));
                         TL = VSUB(T3, T6);
                         TO = VSUB(TM, TN);
                         TX = VSUB(TL, TO);
                         STM4(&(ro[6]), TX, ovs, &(ro[0]));
                         TY = VADD(TL, TO);
                         STM4(&(ro[2]), TY, ovs, &(ro[0]));
                    }
                    {
                         V TZ, T10, T11, T12;
                         {
                              V Tp, TA, TJ, TK;
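                              /* Editor's note (added): without FMA the
                                 scaled butterfly is an explicit multiply by
                                 KP707106781 followed by a subtract/add pair:
                                 TZ = Tp - TA and T10 = Tp + TA. */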
                              Tp = VADD(Tn, To);
                              TA = VMUL(LDK(KP707106781), VADD(Tu, Tz));
                              TZ = VSUB(Tp, TA);
                              STM4(&(ro[5]), TZ, ovs, &(ro[1]));
                              T10 = VADD(Tp, TA);
                              STM4(&(ro[1]), T10, ovs, &(ro[1]));
                              TJ = VSUB(TC, TB);
                              TK = VMUL(LDK(KP707106781), VADD(TG, TH));
                              T11 = VSUB(TJ, TK);
                              STM4(&(io[5]), T11, ovs, &(io[1]));
                              T12 = VADD(TJ, TK);
                              STM4(&(io[1]), T12, ovs, &(io[1]));
                         }
                         {
                              V TD, TE, T13, T14;
                              TD = VADD(TB, TC);
                              TE = VMUL(LDK(KP707106781), VSUB(Tz, Tu));
                              T13 = VSUB(TD, TE);
                              STM4(&(io[7]), T13, ovs, &(io[1]));
                              STN4(&(io[4]), TT, T11, TW, T13, ovs);
                              T14 = VADD(TD, TE);
                              STM4(&(io[3]), T14, ovs, &(io[1]));
                              STN4(&(io[0]), TU, T12, TV, T14, ovs);
                         }
                         {
                              V TF, TI, T15, T16;
                              TF = VSUB(Tn, To);
                              TI = VMUL(LDK(KP707106781), VSUB(TG, TH));
                              T15 = VSUB(TF, TI);
                              STM4(&(ro[7]), T15, ovs, &(ro[1]));
                              STN4(&(ro[4]), TR, TZ, TX, T15, ovs);
                              T16 = VADD(TF, TI);
                              STM4(&(ro[3]), T16, ovs, &(ro[1]));
                              STN4(&(ro[0]), TS, T10, TY, T16, ovs);
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 8, XSIMD_STRING("n2sv_8"), {52, 4, 0, 0}, &GENUS, 0, 1, 0, 0 };

void XSIMD(codelet_n2sv_8) (planner *p) {
     X(kdft_register) (p, n2sv_8, &desc);
}

#endif
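
/*
 * Editor's note (added): a minimal scalar reference for sanity-checking one
 * size-8 transform, not part of FFTW.  It assumes the forward sign
 * convention e^(-2*pi*i*n*k/8) and the same split real/imaginary array
 * format as the codelet (unit stride, a single transform); the guard macro
 * N2SV_8_REFERENCE_CHECK and the function name are hypothetical names of
 * our own, so the file compiles unchanged unless the macro is defined.
 */
#ifdef N2SV_8_REFERENCE_CHECK
#include <math.h>

/* Naive O(n^2) size-8 DFT over split-format arrays. */
static void dft8_reference(const double *re_in, const double *im_in,
                           double *re_out, double *im_out)
{
     const double TWOPI = 6.28318530717958647692528676655900577;
     int k, n;
     for (k = 0; k < 8; ++k) {
          double sum_re = 0.0, sum_im = 0.0;
          for (n = 0; n < 8; ++n) {
               double ang = -TWOPI * (double) (n * k) / 8.0;
               double c = cos(ang), s = sin(ang);
               /* accumulate (re_in[n] + i*im_in[n]) * (c + i*s) */
               sum_re += re_in[n] * c - im_in[n] * s;
               sum_im += re_in[n] * s + im_in[n] * c;
          }
          re_out[k] = sum_re;
          im_out[k] = sum_im;
     }
}
#endif /* N2SV_8_REFERENCE_CHECK */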