annotate src/fftw-3.3.8/dft/simd/common/n2sv_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:05:19 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_notw.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n2sv_16 -with-ostride 1 -include dft/simd/n2s.h -store-multiple 4 */
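/* Notes on the generator flags above, as far as they show up in this
   file: -n 16 and -name n2sv_16 fix the transform size and codelet
   name, -fma selects this fused multiply-add variant (guarded by the
   #if above), and -store-multiple 4 is why results are first written
   with STM4() and then coalesced four at a time with STN4(). */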

/*
 * This function contains 144 FP additions, 40 FP multiplications,
 * (or, 104 additions, 0 multiplications, 40 fused multiply/add),
 * 74 stack variables, 3 constants, and 72 memory accesses
 */
#include "dft/simd/n2s.h"

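/*
 * For reference, the three vector constants below are the size-16
 * twiddle values KP707106781 = 1/sqrt(2) = cos(pi/4), KP414213562 =
 * sqrt(2) - 1 = tan(pi/8), and KP923879532 = cos(pi/8).  Writing
 * sin(pi/8)*x +/- cos(pi/8)*y as cos(pi/8)*(tan(pi/8)*x +/- y) lets
 * the generator express every product as a fused multiply-add, which
 * is why the operation count above shows 0 plain multiplications.
 */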
static void n2sv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
    DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
    DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT i;
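        /* Each pass of this loop performs 2*VL of the v independent
           transforms (in the usual n2sv layout, one transform per lane
           of the paired real/imaginary vectors): the four pointers
           advance by 2*VL strides and the trip count drops by 2*VL. */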
        for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
            V T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
            V T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
            V T1U, T1A;
            {
                V T3, TL, Ty, T1k, T6, T1j, TB, TM;
                {
                    V T1, T2, Tw, Tx;
                    T1 = LD(&(ri[0]), ivs, &(ri[0]));
                    T2 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
                    T3 = VADD(T1, T2);
                    TL = VSUB(T1, T2);
                    Tw = LD(&(ii[0]), ivs, &(ii[0]));
                    Tx = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
                    Ty = VADD(Tw, Tx);
                    T1k = VSUB(Tw, Tx);
                }
                {
                    V T4, T5, Tz, TA;
                    T4 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
                    T5 = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
                    T6 = VADD(T4, T5);
                    T1j = VSUB(T4, T5);
                    Tz = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
                    TA = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
                    TB = VADD(Tz, TA);
                    TM = VSUB(Tz, TA);
                }
                T7 = VADD(T3, T6);
                T1R = VSUB(T3, T6);
                T25 = VSUB(Ty, TB);
                TC = VADD(Ty, TB);
                TN = VSUB(TL, TM);
                T1x = VADD(TL, TM);
                T1H = VSUB(T1k, T1j);
                T1l = VADD(T1j, T1k);
            }
            {
                V Tp, T1c, T1a, T20, Ts, T17, T1f, T21;
                {
                    V Tn, To, T18, T19;
                    Tn = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
                    To = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
                    Tp = VADD(Tn, To);
                    T1c = VSUB(Tn, To);
                    T18 = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
                    T19 = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
                    T1a = VSUB(T18, T19);
                    T20 = VADD(T18, T19);
                }
                {
                    V Tq, Tr, T1d, T1e;
                    Tq = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
                    Tr = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
                    Ts = VADD(Tq, Tr);
                    T17 = VSUB(Tq, Tr);
                    T1d = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
                    T1e = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
                    T1f = VSUB(T1d, T1e);
                    T21 = VADD(T1d, T1e);
                }
                Tt = VADD(Tp, Ts);
                T22 = VSUB(T20, T21);
                T2h = VADD(T20, T21);
                T1b = VADD(T17, T1a);
                T1g = VSUB(T1c, T1f);
                T1E = VSUB(T1a, T17);
                T1Z = VSUB(Tp, Ts);
                T1D = VADD(T1c, T1f);
            }
            {
                V Ta, TP, TF, TO, Td, TR, TI, TS;
                {
                    V T8, T9, TD, TE;
                    T8 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
                    T9 = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
                    Ta = VADD(T8, T9);
                    TP = VSUB(T8, T9);
                    TD = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
                    TE = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
                    TF = VADD(TD, TE);
                    TO = VSUB(TD, TE);
                }
                {
                    V Tb, Tc, TG, TH;
                    Tb = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
                    Tc = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
                    Td = VADD(Tb, Tc);
                    TR = VSUB(Tb, Tc);
                    TG = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
                    TH = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
                    TI = VADD(TG, TH);
                    TS = VSUB(TG, TH);
                }
                Te = VADD(Ta, Td);
                T1S = VSUB(TF, TI);
                T26 = VSUB(Td, Ta);
                TJ = VADD(TF, TI);
                TQ = VSUB(TO, TP);
                T1m = VSUB(TR, TS);
                T1n = VADD(TP, TO);
                TT = VADD(TR, TS);
            }
            {
                V Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
                {
                    V Tg, Th, TX, TY;
                    Tg = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
                    Th = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
                    Ti = VADD(Tg, Th);
                    T11 = VSUB(Tg, Th);
                    TX = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
                    TY = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
                    TZ = VSUB(TX, TY);
                    T1V = VADD(TX, TY);
                }
                {
                    V Tj, Tk, T12, T13;
                    Tj = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
                    Tk = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
                    Tl = VADD(Tj, Tk);
                    TW = VSUB(Tj, Tk);
                    T12 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
                    T13 = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
                    T14 = VSUB(T12, T13);
                    T1W = VADD(T12, T13);
                }
                Tm = VADD(Ti, Tl);
                T1X = VSUB(T1V, T1W);
                T2g = VADD(T1V, T1W);
                T10 = VADD(TW, TZ);
                T15 = VSUB(T11, T14);
                T1B = VSUB(TZ, TW);
                T1U = VSUB(Ti, Tl);
                T1A = VADD(T11, T14);
            }
            {
                V T2l, T2m, T2n, T2o, T2p, T2q, T2r, T2s;
                {
                    V Tf, Tu, T2j, T2k;
                    Tf = VADD(T7, Te);
                    Tu = VADD(Tm, Tt);
                    T2l = VSUB(Tf, Tu);
                    STM4(&(ro[8]), T2l, ovs, &(ro[0]));
                    T2m = VADD(Tf, Tu);
                    STM4(&(ro[0]), T2m, ovs, &(ro[0]));
                    T2j = VADD(TC, TJ);
                    T2k = VADD(T2g, T2h);
                    T2n = VSUB(T2j, T2k);
                    STM4(&(io[8]), T2n, ovs, &(io[0]));
                    T2o = VADD(T2j, T2k);
                    STM4(&(io[0]), T2o, ovs, &(io[0]));
                }
                {
                    V Tv, TK, T2f, T2i;
                    Tv = VSUB(Tt, Tm);
                    TK = VSUB(TC, TJ);
                    T2p = VADD(Tv, TK);
                    STM4(&(io[4]), T2p, ovs, &(io[0]));
                    T2q = VSUB(TK, Tv);
                    STM4(&(io[12]), T2q, ovs, &(io[0]));
                    T2f = VSUB(T7, Te);
                    T2i = VSUB(T2g, T2h);
                    T2r = VSUB(T2f, T2i);
                    STM4(&(ro[12]), T2r, ovs, &(ro[0]));
                    T2s = VADD(T2f, T2i);
                    STM4(&(ro[4]), T2s, ovs, &(ro[0]));
                }
                {
                    V T2t, T2u, T2v, T2w, T2x, T2y, T2z, T2A;
                    {
                        V T1T, T27, T24, T28, T1Y, T23;
                        T1T = VADD(T1R, T1S);
                        T27 = VSUB(T25, T26);
                        T1Y = VADD(T1U, T1X);
                        T23 = VSUB(T1Z, T22);
                        T24 = VADD(T1Y, T23);
                        T28 = VSUB(T23, T1Y);
                        T2t = VFNMS(LDK(KP707106781), T24, T1T);
                        STM4(&(ro[10]), T2t, ovs, &(ro[0]));
                        T2u = VFMA(LDK(KP707106781), T28, T27);
                        STM4(&(io[6]), T2u, ovs, &(io[0]));
                        T2v = VFMA(LDK(KP707106781), T24, T1T);
                        STM4(&(ro[2]), T2v, ovs, &(ro[0]));
                        T2w = VFNMS(LDK(KP707106781), T28, T27);
                        STM4(&(io[14]), T2w, ovs, &(io[0]));
                    }
                    {
                        V T29, T2d, T2c, T2e, T2a, T2b;
                        T29 = VSUB(T1R, T1S);
                        T2d = VADD(T26, T25);
                        T2a = VSUB(T1X, T1U);
                        T2b = VADD(T1Z, T22);
                        T2c = VSUB(T2a, T2b);
                        T2e = VADD(T2a, T2b);
                        T2x = VFNMS(LDK(KP707106781), T2c, T29);
                        STM4(&(ro[14]), T2x, ovs, &(ro[0]));
                        T2y = VFMA(LDK(KP707106781), T2e, T2d);
                        STM4(&(io[2]), T2y, ovs, &(io[0]));
                        T2z = VFMA(LDK(KP707106781), T2c, T29);
                        STM4(&(ro[6]), T2z, ovs, &(ro[0]));
                        T2A = VFNMS(LDK(KP707106781), T2e, T2d);
                        STM4(&(io[10]), T2A, ovs, &(io[0]));
                    }
                    {
                        V T2B, T2C, T2D, T2E, T2F, T2G, T2H, T2I;
                        {
                            V TV, T1v, T1p, T1r, T1i, T1q, T1u, T1w, TU, T1o;
                            TU = VSUB(TQ, TT);
                            TV = VFMA(LDK(KP707106781), TU, TN);
                            T1v = VFNMS(LDK(KP707106781), TU, TN);
                            T1o = VSUB(T1m, T1n);
                            T1p = VFNMS(LDK(KP707106781), T1o, T1l);
                            T1r = VFMA(LDK(KP707106781), T1o, T1l);
                            {
                                V T16, T1h, T1s, T1t;
                                T16 = VFMA(LDK(KP414213562), T15, T10);
                                T1h = VFNMS(LDK(KP414213562), T1g, T1b);
                                T1i = VSUB(T16, T1h);
                                T1q = VADD(T16, T1h);
                                T1s = VFMA(LDK(KP414213562), T1b, T1g);
                                T1t = VFNMS(LDK(KP414213562), T10, T15);
                                T1u = VSUB(T1s, T1t);
                                T1w = VADD(T1t, T1s);
                            }
                            T2B = VFNMS(LDK(KP923879532), T1i, TV);
                            STM4(&(ro[11]), T2B, ovs, &(ro[1]));
                            T2C = VFNMS(LDK(KP923879532), T1u, T1r);
                            STM4(&(io[11]), T2C, ovs, &(io[1]));
                            T2D = VFMA(LDK(KP923879532), T1i, TV);
                            STM4(&(ro[3]), T2D, ovs, &(ro[1]));
                            T2E = VFMA(LDK(KP923879532), T1u, T1r);
                            STM4(&(io[3]), T2E, ovs, &(io[1]));
                            T2F = VFNMS(LDK(KP923879532), T1q, T1p);
                            STM4(&(io[7]), T2F, ovs, &(io[1]));
                            T2G = VFNMS(LDK(KP923879532), T1w, T1v);
                            STM4(&(ro[7]), T2G, ovs, &(ro[1]));
                            T2H = VFMA(LDK(KP923879532), T1q, T1p);
                            STM4(&(io[15]), T2H, ovs, &(io[1]));
                            T2I = VFMA(LDK(KP923879532), T1w, T1v);
                            STM4(&(ro[15]), T2I, ovs, &(ro[1]));
                        }
                        {
                            V T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
                            T1y = VADD(T1n, T1m);
                            T1z = VFMA(LDK(KP707106781), T1y, T1x);
                            T1L = VFNMS(LDK(KP707106781), T1y, T1x);
                            T1I = VADD(TQ, TT);
                            T1J = VFNMS(LDK(KP707106781), T1I, T1H);
                            T1P = VFMA(LDK(KP707106781), T1I, T1H);
                            {
                                V T1C, T1F, T1M, T1N;
                                T1C = VFMA(LDK(KP414213562), T1B, T1A);
                                T1F = VFNMS(LDK(KP414213562), T1E, T1D);
                                T1G = VADD(T1C, T1F);
                                T1K = VSUB(T1F, T1C);
                                T1M = VFNMS(LDK(KP414213562), T1A, T1B);
                                T1N = VFMA(LDK(KP414213562), T1D, T1E);
                                T1O = VSUB(T1M, T1N);
                                T1Q = VADD(T1M, T1N);
                            }
                            {
                                V T2J, T2K, T2L, T2M;
                                T2J = VFNMS(LDK(KP923879532), T1G, T1z);
                                STM4(&(ro[9]), T2J, ovs, &(ro[1]));
                                STN4(&(ro[8]), T2l, T2J, T2t, T2B, ovs);
                                T2K = VFNMS(LDK(KP923879532), T1Q, T1P);
                                STM4(&(io[9]), T2K, ovs, &(io[1]));
                                STN4(&(io[8]), T2n, T2K, T2A, T2C, ovs);
                                T2L = VFMA(LDK(KP923879532), T1G, T1z);
                                STM4(&(ro[1]), T2L, ovs, &(ro[1]));
                                STN4(&(ro[0]), T2m, T2L, T2v, T2D, ovs);
                                T2M = VFMA(LDK(KP923879532), T1Q, T1P);
                                STM4(&(io[1]), T2M, ovs, &(io[1]));
                                STN4(&(io[0]), T2o, T2M, T2y, T2E, ovs);
                            }
                            {
                                V T2N, T2O, T2P, T2Q;
                                T2N = VFNMS(LDK(KP923879532), T1K, T1J);
                                STM4(&(io[13]), T2N, ovs, &(io[1]));
                                STN4(&(io[12]), T2q, T2N, T2w, T2H, ovs);
                                T2O = VFNMS(LDK(KP923879532), T1O, T1L);
                                STM4(&(ro[13]), T2O, ovs, &(ro[1]));
                                STN4(&(ro[12]), T2r, T2O, T2x, T2I, ovs);
                                T2P = VFMA(LDK(KP923879532), T1K, T1J);
                                STM4(&(io[5]), T2P, ovs, &(io[1]));
                                STN4(&(io[4]), T2p, T2P, T2u, T2F, ovs);
                                T2Q = VFMA(LDK(KP923879532), T1O, T1L);
                                STM4(&(ro[5]), T2Q, ovs, &(ro[1]));
                                STN4(&(ro[4]), T2s, T2Q, T2z, T2G, ovs);
                            }
                        }
                    }
                }
            }
        }
    }
    VLEAVE();
}

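/* The {104, 0, 40, 0} entry below repeats the add/mul/fma operation
   counts quoted in the comment above this variant; the planner can
   consult them when costing candidate plans. */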
static const kdft_desc desc = { 16, XSIMD_STRING("n2sv_16"), {104, 0, 40, 0}, &GENUS, 0, 1, 0, 0 };

void XSIMD(codelet_n2sv_16) (planner *p) {
    X(kdft_register) (p, n2sv_16, &desc);
}

#else

/* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n2sv_16 -with-ostride 1 -include dft/simd/n2s.h -store-multiple 4 */

/*
 * This function contains 144 FP additions, 24 FP multiplications,
 * (or, 136 additions, 16 multiplications, 8 fused multiply/add),
 * 74 stack variables, 3 constants, and 72 memory accesses
 */
#include "dft/simd/n2s.h"

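/*
 * In this non-FMA variant the same angles appear as explicit
 * products: KP382683432 = sin(pi/8), KP923879532 = cos(pi/8), and
 * KP707106781 = 1/sqrt(2) are applied through VMUL() rather than
 * being folded into fused operations.
 */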
static void n2sv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
    DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
    DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT i;
        for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
            V T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
            V T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
            V T1U, T1A;
            {
                V T3, TL, Ty, T1k, T6, T1j, TB, TM;
                {
                    V T1, T2, Tw, Tx;
                    T1 = LD(&(ri[0]), ivs, &(ri[0]));
                    T2 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
                    T3 = VADD(T1, T2);
                    TL = VSUB(T1, T2);
                    Tw = LD(&(ii[0]), ivs, &(ii[0]));
                    Tx = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
                    Ty = VADD(Tw, Tx);
                    T1k = VSUB(Tw, Tx);
                }
                {
                    V T4, T5, Tz, TA;
                    T4 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
                    T5 = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
                    T6 = VADD(T4, T5);
                    T1j = VSUB(T4, T5);
                    Tz = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
                    TA = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
                    TB = VADD(Tz, TA);
                    TM = VSUB(Tz, TA);
                }
                T7 = VADD(T3, T6);
                T1R = VSUB(T3, T6);
                T25 = VSUB(Ty, TB);
                TC = VADD(Ty, TB);
                TN = VSUB(TL, TM);
                T1x = VADD(TL, TM);
                T1H = VSUB(T1k, T1j);
                T1l = VADD(T1j, T1k);
            }
            {
                V Tp, T17, T1f, T20, Ts, T1c, T1a, T21;
                {
                    V Tn, To, T1d, T1e;
                    Tn = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
                    To = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
                    Tp = VADD(Tn, To);
                    T17 = VSUB(Tn, To);
                    T1d = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
                    T1e = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
                    T1f = VSUB(T1d, T1e);
                    T20 = VADD(T1d, T1e);
                }
                {
                    V Tq, Tr, T18, T19;
                    Tq = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
                    Tr = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
                    Ts = VADD(Tq, Tr);
                    T1c = VSUB(Tq, Tr);
                    T18 = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
                    T19 = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
                    T1a = VSUB(T18, T19);
                    T21 = VADD(T18, T19);
                }
                Tt = VADD(Tp, Ts);
                T22 = VSUB(T20, T21);
                T2h = VADD(T20, T21);
                T1b = VSUB(T17, T1a);
                T1g = VADD(T1c, T1f);
                T1E = VSUB(T1f, T1c);
                T1Z = VSUB(Tp, Ts);
                T1D = VADD(T17, T1a);
            }
            {
                V Ta, TP, TF, TO, Td, TR, TI, TS;
                {
                    V T8, T9, TD, TE;
                    T8 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
                    T9 = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
                    Ta = VADD(T8, T9);
                    TP = VSUB(T8, T9);
                    TD = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
                    TE = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
                    TF = VADD(TD, TE);
                    TO = VSUB(TD, TE);
                }
                {
                    V Tb, Tc, TG, TH;
                    Tb = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
                    Tc = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
                    Td = VADD(Tb, Tc);
                    TR = VSUB(Tb, Tc);
                    TG = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
                    TH = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
                    TI = VADD(TG, TH);
                    TS = VSUB(TG, TH);
                }
                Te = VADD(Ta, Td);
                T1S = VSUB(TF, TI);
                T26 = VSUB(Td, Ta);
                TJ = VADD(TF, TI);
                TQ = VSUB(TO, TP);
                T1m = VSUB(TR, TS);
                T1n = VADD(TP, TO);
                TT = VADD(TR, TS);
            }
            {
                V Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
                {
                    V Tg, Th, TX, TY;
                    Tg = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
                    Th = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
                    Ti = VADD(Tg, Th);
                    T11 = VSUB(Tg, Th);
                    TX = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
                    TY = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
                    TZ = VSUB(TX, TY);
                    T1V = VADD(TX, TY);
                }
                {
                    V Tj, Tk, T12, T13;
                    Tj = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
                    Tk = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
                    Tl = VADD(Tj, Tk);
                    TW = VSUB(Tj, Tk);
                    T12 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
                    T13 = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
                    T14 = VSUB(T12, T13);
                    T1W = VADD(T12, T13);
                }
                Tm = VADD(Ti, Tl);
                T1X = VSUB(T1V, T1W);
                T2g = VADD(T1V, T1W);
                T10 = VADD(TW, TZ);
                T15 = VSUB(T11, T14);
                T1B = VADD(T11, T14);
                T1U = VSUB(Ti, Tl);
                T1A = VSUB(TZ, TW);
            }
            {
                V T2l, T2m, T2n, T2o, T2p, T2q, T2r, T2s;
                {
                    V Tf, Tu, T2j, T2k;
                    Tf = VADD(T7, Te);
                    Tu = VADD(Tm, Tt);
                    T2l = VSUB(Tf, Tu);
                    STM4(&(ro[8]), T2l, ovs, &(ro[0]));
                    T2m = VADD(Tf, Tu);
                    STM4(&(ro[0]), T2m, ovs, &(ro[0]));
                    T2j = VADD(TC, TJ);
                    T2k = VADD(T2g, T2h);
                    T2n = VSUB(T2j, T2k);
                    STM4(&(io[8]), T2n, ovs, &(io[0]));
                    T2o = VADD(T2j, T2k);
                    STM4(&(io[0]), T2o, ovs, &(io[0]));
                }
                {
                    V Tv, TK, T2f, T2i;
                    Tv = VSUB(Tt, Tm);
                    TK = VSUB(TC, TJ);
                    T2p = VADD(Tv, TK);
                    STM4(&(io[4]), T2p, ovs, &(io[0]));
                    T2q = VSUB(TK, Tv);
                    STM4(&(io[12]), T2q, ovs, &(io[0]));
                    T2f = VSUB(T7, Te);
                    T2i = VSUB(T2g, T2h);
                    T2r = VSUB(T2f, T2i);
                    STM4(&(ro[12]), T2r, ovs, &(ro[0]));
                    T2s = VADD(T2f, T2i);
                    STM4(&(ro[4]), T2s, ovs, &(ro[0]));
                }
                {
                    V T2t, T2u, T2v, T2w, T2x, T2y, T2z, T2A;
                    {
                        V T1T, T27, T24, T28, T1Y, T23;
                        T1T = VADD(T1R, T1S);
                        T27 = VSUB(T25, T26);
                        T1Y = VADD(T1U, T1X);
                        T23 = VSUB(T1Z, T22);
                        T24 = VMUL(LDK(KP707106781), VADD(T1Y, T23));
                        T28 = VMUL(LDK(KP707106781), VSUB(T23, T1Y));
                        T2t = VSUB(T1T, T24);
                        STM4(&(ro[10]), T2t, ovs, &(ro[0]));
                        T2u = VADD(T27, T28);
                        STM4(&(io[6]), T2u, ovs, &(io[0]));
                        T2v = VADD(T1T, T24);
                        STM4(&(ro[2]), T2v, ovs, &(ro[0]));
                        T2w = VSUB(T27, T28);
                        STM4(&(io[14]), T2w, ovs, &(io[0]));
                    }
                    {
                        V T29, T2d, T2c, T2e, T2a, T2b;
                        T29 = VSUB(T1R, T1S);
                        T2d = VADD(T26, T25);
                        T2a = VSUB(T1X, T1U);
                        T2b = VADD(T1Z, T22);
                        T2c = VMUL(LDK(KP707106781), VSUB(T2a, T2b));
                        T2e = VMUL(LDK(KP707106781), VADD(T2a, T2b));
                        T2x = VSUB(T29, T2c);
                        STM4(&(ro[14]), T2x, ovs, &(ro[0]));
                        T2y = VADD(T2d, T2e);
                        STM4(&(io[2]), T2y, ovs, &(io[0]));
                        T2z = VADD(T29, T2c);
                        STM4(&(ro[6]), T2z, ovs, &(ro[0]));
                        T2A = VSUB(T2d, T2e);
                        STM4(&(io[10]), T2A, ovs, &(io[0]));
                    }
                    {
                        V T2B, T2C, T2D, T2E, T2F, T2G, T2H, T2I;
                        {
                            V TV, T1r, T1p, T1v, T1i, T1q, T1u, T1w, TU, T1o;
                            TU = VMUL(LDK(KP707106781), VSUB(TQ, TT));
                            TV = VADD(TN, TU);
                            T1r = VSUB(TN, TU);
                            T1o = VMUL(LDK(KP707106781), VSUB(T1m, T1n));
                            T1p = VSUB(T1l, T1o);
                            T1v = VADD(T1l, T1o);
                            {
                                V T16, T1h, T1s, T1t;
                                T16 = VFMA(LDK(KP923879532), T10, VMUL(LDK(KP382683432), T15));
                                T1h = VFNMS(LDK(KP923879532), T1g, VMUL(LDK(KP382683432), T1b));
                                T1i = VADD(T16, T1h);
                                T1q = VSUB(T1h, T16);
                                T1s = VFNMS(LDK(KP923879532), T15, VMUL(LDK(KP382683432), T10));
                                T1t = VFMA(LDK(KP382683432), T1g, VMUL(LDK(KP923879532), T1b));
                                T1u = VSUB(T1s, T1t);
                                T1w = VADD(T1s, T1t);
                            }
                            T2B = VSUB(TV, T1i);
                            STM4(&(ro[11]), T2B, ovs, &(ro[1]));
                            T2C = VSUB(T1v, T1w);
                            STM4(&(io[11]), T2C, ovs, &(io[1]));
                            T2D = VADD(TV, T1i);
                            STM4(&(ro[3]), T2D, ovs, &(ro[1]));
                            T2E = VADD(T1v, T1w);
                            STM4(&(io[3]), T2E, ovs, &(io[1]));
                            T2F = VSUB(T1p, T1q);
                            STM4(&(io[15]), T2F, ovs, &(io[1]));
                            T2G = VSUB(T1r, T1u);
                            STM4(&(ro[15]), T2G, ovs, &(ro[1]));
                            T2H = VADD(T1p, T1q);
                            STM4(&(io[7]), T2H, ovs, &(io[1]));
                            T2I = VADD(T1r, T1u);
                            STM4(&(ro[7]), T2I, ovs, &(ro[1]));
                        }
                        {
                            V T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
                            T1y = VMUL(LDK(KP707106781), VADD(T1n, T1m));
                            T1z = VADD(T1x, T1y);
                            T1L = VSUB(T1x, T1y);
                            T1I = VMUL(LDK(KP707106781), VADD(TQ, TT));
                            T1J = VSUB(T1H, T1I);
                            T1P = VADD(T1H, T1I);
                            {
                                V T1C, T1F, T1M, T1N;
                                T1C = VFMA(LDK(KP382683432), T1A, VMUL(LDK(KP923879532), T1B));
                                T1F = VFNMS(LDK(KP382683432), T1E, VMUL(LDK(KP923879532), T1D));
                                T1G = VADD(T1C, T1F);
                                T1K = VSUB(T1F, T1C);
                                T1M = VFNMS(LDK(KP382683432), T1B, VMUL(LDK(KP923879532), T1A));
                                T1N = VFMA(LDK(KP923879532), T1E, VMUL(LDK(KP382683432), T1D));
                                T1O = VSUB(T1M, T1N);
                                T1Q = VADD(T1M, T1N);
                            }
                            {
                                V T2J, T2K, T2L, T2M;
                                T2J = VSUB(T1z, T1G);
                                STM4(&(ro[9]), T2J, ovs, &(ro[1]));
                                STN4(&(ro[8]), T2l, T2J, T2t, T2B, ovs);
                                T2K = VSUB(T1P, T1Q);
                                STM4(&(io[9]), T2K, ovs, &(io[1]));
                                STN4(&(io[8]), T2n, T2K, T2A, T2C, ovs);
                                T2L = VADD(T1z, T1G);
                                STM4(&(ro[1]), T2L, ovs, &(ro[1]));
                                STN4(&(ro[0]), T2m, T2L, T2v, T2D, ovs);
                                T2M = VADD(T1P, T1Q);
                                STM4(&(io[1]), T2M, ovs, &(io[1]));
                                STN4(&(io[0]), T2o, T2M, T2y, T2E, ovs);
                            }
                            {
                                V T2N, T2O, T2P, T2Q;
                                T2N = VSUB(T1J, T1K);
                                STM4(&(io[13]), T2N, ovs, &(io[1]));
                                STN4(&(io[12]), T2q, T2N, T2w, T2F, ovs);
                                T2O = VSUB(T1L, T1O);
                                STM4(&(ro[13]), T2O, ovs, &(ro[1]));
                                STN4(&(ro[12]), T2r, T2O, T2x, T2G, ovs);
                                T2P = VADD(T1J, T1K);
                                STM4(&(io[5]), T2P, ovs, &(io[1]));
                                STN4(&(io[4]), T2p, T2P, T2u, T2H, ovs);
                                T2Q = VADD(T1L, T1O);
                                STM4(&(ro[5]), T2Q, ovs, &(ro[1]));
                                STN4(&(ro[4]), T2s, T2Q, T2z, T2I, ovs);
                            }
                        }
                    }
                }
            }
        }
    }
    VLEAVE();
}

static const kdft_desc desc = { 16, XSIMD_STRING("n2sv_16"), {136, 16, 8, 0}, &GENUS, 0, 1, 0, 0 };

void XSIMD(codelet_n2sv_16) (planner *p) {
    X(kdft_register) (p, n2sv_16, &desc);
}

#endif
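
/*
 * Illustrative only, not part of the generated codelet: a direct
 * O(n^2) scalar DFT of size 16 over the same split real/imaginary
 * layout, handy as a reference when checking one transform's output.
 * This is a sketch under assumptions: ref_dft16 is a hypothetical
 * helper, plain int strides stand in for the stride type, it handles
 * one transform per call, and it uses FFTW's forward sign convention
 * exp(-2*pi*i*n*k/16).  Kept under #if 0 so it cannot affect the
 * build.
 */
#if 0
#include <math.h>

static void ref_dft16(const R *ri, const R *ii, R *ro, R *io,
                      int is, int os)
{
    static const double TWOPI = 6.28318530717958647692528676655900577;
    int k, n;
    for (k = 0; k < 16; ++k) {
        double sr = 0.0, si = 0.0;
        for (n = 0; n < 16; ++n) {
            /* accumulate (ri + i*ii) * (cos w + i*sin w), w = -2*pi*n*k/16 */
            double w = -TWOPI * (double) (n * k) / 16.0;
            sr += ri[n * is] * cos(w) - ii[n * is] * sin(w);
            si += ri[n * is] * sin(w) + ii[n * is] * cos(w);
        }
        ro[k * os] = (R) sr;
        io[k * os] = (R) si;
    }
}
#endif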