annotate src/fftw-3.3.3/dft/simd/common/n2sv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04, for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sun Nov 25 07:37:47 EST 2012 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name n2sv_8 -with-ostride 1 -include n2s.h -store-multiple 4 */

/*
 * This function contains 52 FP additions, 8 FP multiplications,
 * (or, 44 additions, 0 multiplications, 8 fused multiply/add),
 * 58 stack variables, 1 constants, and 36 memory accesses
 */
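
/*
 * Reader's note (not generated output): the codelets in this file compute an
 * 8-point complex DFT, X[k] = sum_{n=0..7} x[n] * exp(-2*pi*i*n*k/8), on
 * split real/imaginary arrays (FFTW's forward-transform sign convention).
 * The guarded block below is a minimal scalar sketch of that transform, for
 * orientation only; the names dft8_reference and PI are illustrative and are
 * not part of FFTW.
 */
#if 0
#include <math.h>

static void dft8_reference(const double *ri, const double *ii,
                           double *ro, double *io)
{
     const double PI = 3.14159265358979323846;
     int k, n;
     for (k = 0; k < 8; ++k) {
          double sr = 0.0, si = 0.0;
          for (n = 0; n < 8; ++n) {
               /* twiddle factor exp(-2*pi*i*n*k/8) = c + i*s */
               double c = cos(-2.0 * PI * n * k / 8.0);
               double s = sin(-2.0 * PI * n * k / 8.0);
               /* accumulate (ri[n] + i*ii[n]) * (c + i*s) */
               sr += ri[n] * c - ii[n] * s;
               si += ri[n] * s + ii[n] * c;
          }
          ro[k] = sr;
          io[k] = si;
     }
}
#endif
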
#include "n2s.h"

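/*
 * Reader's note (not generated output): in the SIMD codelet calling
 * convention used here, ri/ii point to the real and imaginary input arrays
 * and ro/io to the outputs (split format), is/os are the element strides
 * within one transform, v is the number of transforms to perform, and
 * ivs/ovs are the strides between successive transforms.
 */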
static void n2sv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
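     /* Reader's note (not generated output): KP707106781 is 1/sqrt(2)
        = cos(pi/4) = sin(pi/4), the only nontrivial twiddle-factor
        component that appears in a size-8 DFT. */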
     {
          INT i;
          for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
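               /* Reader's note (not generated output): each iteration consumes
                  2 * VL of the v requested transforms (VL is the SIMD vector
                  length), advancing the input and output pointers by the
                  corresponding vector strides. */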
               V TF, TJ, TD, TR, TS, TT, TU, TV, TW, TE, TX, TY, TK, TI, TZ;
               V T10, T11, T12;
               {
                    V Tb, Tn, T3, TC, Ti, TB, T6, To, Tl, Tc, Tw, Tx, T8, T9, Tr;
                    V Ts;
                    {
                         V T1, T2, Tg, Th, T4, T5, Tj, Tk;
                         T1 = LD(&(ri[0]), ivs, &(ri[0]));
                         T2 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
                         Tg = LD(&(ii[0]), ivs, &(ii[0]));
                         Th = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
                         T4 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
                         T5 = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
                         Tj = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
                         Tk = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
                         Tb = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
                         Tn = VSUB(T1, T2);
                         T3 = VADD(T1, T2);
                         TC = VSUB(Tg, Th);
                         Ti = VADD(Tg, Th);
                         TB = VSUB(T4, T5);
                         T6 = VADD(T4, T5);
                         To = VSUB(Tj, Tk);
                         Tl = VADD(Tj, Tk);
                         Tc = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
                         Tw = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
                         Tx = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
                         T8 = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
                         T9 = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
                         Tr = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
                         Ts = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
                    }
                    {
                         V TL, T7, TP, Tm, Tz, TH, Te, Tf, TO, TQ, TG, Tu, Tp, TA;
                         {
                              V Td, Tv, TN, Ty, Ta, Tq, TM, Tt;
                              TL = VSUB(T3, T6);
                              T7 = VADD(T3, T6);
                              Td = VADD(Tb, Tc);
                              Tv = VSUB(Tb, Tc);
                              TN = VADD(Tw, Tx);
                              Ty = VSUB(Tw, Tx);
                              Ta = VADD(T8, T9);
                              Tq = VSUB(T8, T9);
                              TM = VADD(Tr, Ts);
                              Tt = VSUB(Tr, Ts);
                              TP = VADD(Ti, Tl);
                              Tm = VSUB(Ti, Tl);
                              Tz = VSUB(Tv, Ty);
                              TH = VADD(Tv, Ty);
                              Te = VADD(Ta, Td);
                              Tf = VSUB(Td, Ta);
                              TO = VSUB(TM, TN);
                              TQ = VADD(TM, TN);
                              TG = VSUB(Tt, Tq);
                              Tu = VADD(Tq, Tt);
                         }
                         TF = VSUB(Tn, To);
                         Tp = VADD(Tn, To);
                         TJ = VSUB(TC, TB);
                         TD = VADD(TB, TC);
                         TR = VSUB(Tm, Tf);
                         STM4(&(io[6]), TR, ovs, &(io[0]));
                         TS = VADD(Tf, Tm);
                         STM4(&(io[2]), TS, ovs, &(io[0]));
                         TT = VADD(T7, Te);
                         STM4(&(ro[0]), TT, ovs, &(ro[0]));
                         TU = VSUB(T7, Te);
                         STM4(&(ro[4]), TU, ovs, &(ro[0]));
                         TV = VADD(TP, TQ);
                         STM4(&(io[0]), TV, ovs, &(io[0]));
                         TW = VSUB(TP, TQ);
                         STM4(&(io[4]), TW, ovs, &(io[0]));
                         TE = VSUB(Tz, Tu);
                         TA = VADD(Tu, Tz);
                         TX = VADD(TL, TO);
                         STM4(&(ro[2]), TX, ovs, &(ro[0]));
                         TY = VSUB(TL, TO);
                         STM4(&(ro[6]), TY, ovs, &(ro[0]));
                         TK = VADD(TG, TH);
                         TI = VSUB(TG, TH);
                         TZ = VFMA(LDK(KP707106781), TA, Tp);
                         STM4(&(ro[1]), TZ, ovs, &(ro[1]));
                         T10 = VFNMS(LDK(KP707106781), TA, Tp);
                         STM4(&(ro[5]), T10, ovs, &(ro[1]));
                    }
               }
               T11 = VFMA(LDK(KP707106781), TK, TJ);
               STM4(&(io[1]), T11, ovs, &(io[1]));
               T12 = VFNMS(LDK(KP707106781), TK, TJ);
               STM4(&(io[5]), T12, ovs, &(io[1]));
               {
                    V T13, T14, T15, T16;
                    T13 = VFMA(LDK(KP707106781), TE, TD);
                    STM4(&(io[3]), T13, ovs, &(io[1]));
                    STN4(&(io[0]), TV, T11, TS, T13, ovs);
                    T14 = VFNMS(LDK(KP707106781), TE, TD);
                    STM4(&(io[7]), T14, ovs, &(io[1]));
                    STN4(&(io[4]), TW, T12, TR, T14, ovs);
                    T15 = VFMA(LDK(KP707106781), TI, TF);
                    STM4(&(ro[3]), T15, ovs, &(ro[1]));
                    STN4(&(ro[0]), TT, TZ, TX, T15, ovs);
                    T16 = VFNMS(LDK(KP707106781), TI, TF);
                    STM4(&(ro[7]), T16, ovs, &(ro[1]));
                    STN4(&(ro[4]), TU, T10, TY, T16, ovs);
               }
          }
     }
     VLEAVE();
}

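/*
 * Reader's note (not generated output): the descriptor below records the
 * transform size (8), the codelet name, and its operation counts in the
 * order additions, multiplications, fused multiply-adds, other; the trailing
 * fields appear to encode stride constraints (this codelet was generated
 * with -with-ostride 1). The registration function makes the codelet
 * available to FFTW's planner.
 */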
static const kdft_desc desc = { 8, XSIMD_STRING("n2sv_8"), {44, 0, 8, 0}, &GENUS, 0, 1, 0, 0 };

void XSIMD(codelet_n2sv_8) (planner *p) {
     X(kdft_register) (p, n2sv_8, &desc);
}

#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -name n2sv_8 -with-ostride 1 -include n2s.h -store-multiple 4 */

/*
 * This function contains 52 FP additions, 4 FP multiplications,
 * (or, 52 additions, 4 multiplications, 0 fused multiply/add),
 * 34 stack variables, 1 constants, and 36 memory accesses
 */
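
/*
 * Reader's note (not generated output): this non-FMA variant computes the
 * same transform as the version above, but replaces the fused multiply-add
 * operations (VFMA/VFNMS) with explicit multiplications by KP707106781
 * followed by separate additions and subtractions, which is reflected in the
 * operation counts quoted above.
 */
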
#include "n2s.h"

static void n2sv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
          for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
               V T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
               V TG;
               {
                    V T1, T2, Tj, Tk;
                    T1 = LD(&(ri[0]), ivs, &(ri[0]));
                    T2 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
                    T3 = VADD(T1, T2);
                    Tn = VSUB(T1, T2);
                    {
                         V Tg, Th, T4, T5;
                         Tg = LD(&(ii[0]), ivs, &(ii[0]));
                         Th = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
                         Ti = VADD(Tg, Th);
                         TC = VSUB(Tg, Th);
                         T4 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
                         T5 = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
                         T6 = VADD(T4, T5);
                         TB = VSUB(T4, T5);
                    }
                    Tj = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
                    Tk = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
                    Tl = VADD(Tj, Tk);
                    To = VSUB(Tj, Tk);
                    {
                         V Tb, Tc, Tv, Tw, Tx, Ty;
                         Tb = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
                         Tc = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
                         Tv = VSUB(Tb, Tc);
                         Tw = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
                         Tx = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
                         Ty = VSUB(Tw, Tx);
                         Td = VADD(Tb, Tc);
                         TN = VADD(Tw, Tx);
                         Tz = VSUB(Tv, Ty);
                         TH = VADD(Tv, Ty);
                    }
                    {
                         V T8, T9, Tq, Tr, Ts, Tt;
                         T8 = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
                         T9 = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
                         Tq = VSUB(T8, T9);
                         Tr = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
                         Ts = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
                         Tt = VSUB(Tr, Ts);
                         Ta = VADD(T8, T9);
                         TM = VADD(Tr, Ts);
                         Tu = VADD(Tq, Tt);
                         TG = VSUB(Tt, Tq);
                    }
               }
               {
                    V TR, TS, TT, TU, TV, TW, TX, TY;
                    {
                         V T7, Te, TP, TQ;
                         T7 = VADD(T3, T6);
                         Te = VADD(Ta, Td);
                         TR = VSUB(T7, Te);
                         STM4(&(ro[4]), TR, ovs, &(ro[0]));
                         TS = VADD(T7, Te);
                         STM4(&(ro[0]), TS, ovs, &(ro[0]));
                         TP = VADD(Ti, Tl);
                         TQ = VADD(TM, TN);
                         TT = VSUB(TP, TQ);
                         STM4(&(io[4]), TT, ovs, &(io[0]));
                         TU = VADD(TP, TQ);
                         STM4(&(io[0]), TU, ovs, &(io[0]));
                    }
                    {
                         V Tf, Tm, TL, TO;
                         Tf = VSUB(Td, Ta);
                         Tm = VSUB(Ti, Tl);
                         TV = VADD(Tf, Tm);
                         STM4(&(io[2]), TV, ovs, &(io[0]));
                         TW = VSUB(Tm, Tf);
                         STM4(&(io[6]), TW, ovs, &(io[0]));
                         TL = VSUB(T3, T6);
                         TO = VSUB(TM, TN);
                         TX = VSUB(TL, TO);
                         STM4(&(ro[6]), TX, ovs, &(ro[0]));
                         TY = VADD(TL, TO);
                         STM4(&(ro[2]), TY, ovs, &(ro[0]));
                    }
                    {
                         V TZ, T10, T11, T12;
                         {
                              V Tp, TA, TJ, TK;
                              Tp = VADD(Tn, To);
                              TA = VMUL(LDK(KP707106781), VADD(Tu, Tz));
                              TZ = VSUB(Tp, TA);
                              STM4(&(ro[5]), TZ, ovs, &(ro[1]));
                              T10 = VADD(Tp, TA);
                              STM4(&(ro[1]), T10, ovs, &(ro[1]));
                              TJ = VSUB(TC, TB);
                              TK = VMUL(LDK(KP707106781), VADD(TG, TH));
                              T11 = VSUB(TJ, TK);
                              STM4(&(io[5]), T11, ovs, &(io[1]));
                              T12 = VADD(TJ, TK);
                              STM4(&(io[1]), T12, ovs, &(io[1]));
                         }
                         {
                              V TD, TE, T13, T14;
                              TD = VADD(TB, TC);
                              TE = VMUL(LDK(KP707106781), VSUB(Tz, Tu));
                              T13 = VSUB(TD, TE);
                              STM4(&(io[7]), T13, ovs, &(io[1]));
                              STN4(&(io[4]), TT, T11, TW, T13, ovs);
                              T14 = VADD(TD, TE);
                              STM4(&(io[3]), T14, ovs, &(io[1]));
                              STN4(&(io[0]), TU, T12, TV, T14, ovs);
                         }
                         {
                              V TF, TI, T15, T16;
                              TF = VSUB(Tn, To);
                              TI = VMUL(LDK(KP707106781), VSUB(TG, TH));
                              T15 = VSUB(TF, TI);
                              STM4(&(ro[7]), T15, ovs, &(ro[1]));
                              STN4(&(ro[4]), TR, TZ, TX, T15, ovs);
                              T16 = VADD(TF, TI);
                              STM4(&(ro[3]), T16, ovs, &(ro[1]));
                              STN4(&(ro[0]), TS, T10, TY, T16, ovs);
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 8, XSIMD_STRING("n2sv_8"), {52, 4, 0, 0}, &GENUS, 0, 1, 0, 0 };

void XSIMD(codelet_n2sv_8) (planner *p) {
     X(kdft_register) (p, n2sv_8, &desc);
}

#endif /* HAVE_FMA */