src/fftw-3.3.5/dft/simd/common/n2sv_16.c @ 83:ae30d91d2ffe

Commit: Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes).
Author: Chris Cannam
Date:   Fri, 07 Feb 2020 11:51:13 +0000
Parent: 2cd0e3b3e1fd
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Jul 30 16:41:20 EDT 2016 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n2sv_16 -with-ostride 1 -include n2s.h -store-multiple 4 */

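/* Editorial note: in the invocation above, -n 16 sets the transform size,
   -name n2sv_16 the codelet name, and -fma selects this fused multiply-add
   variant; the remaining flags appear to tune instruction scheduling and
   the store-multiple SIMD output layout. */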
/*
 * This function contains 144 FP additions, 40 FP multiplications,
 * (or, 104 additions, 0 multiplications, 40 fused multiply/add),
 * 110 stack variables, 3 constants, and 72 memory accesses
 */
#include "n2s.h"

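/* Editorial note: per the usual FFTW codelet calling convention, ri/ii are
   the real/imaginary input arrays, ro/io the outputs, is/os the element
   strides, v the number of transforms to perform, and ivs/ovs the strides
   between successive transforms. */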
static void n2sv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
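     /* Editorial note: to the printed precision, the constants below are
        cos(pi/8), 1/sqrt(2), and tan(pi/8) = sqrt(2) - 1, the twiddle
        factors a size-16 transform needs. */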
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
     {
          INT i;
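          /* Editorial note: each iteration of this loop computes 2*VL
             transforms in parallel, where VL is the SIMD vector length;
             the pointers advance by 2*VL vector strides accordingly. */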
          for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
               V T2p, T2q, T2r, T2s, T2x, T2y, T2z, T2A, T1M, T1N, T1L, T1P, T2F, T2G, T2H;
               V T2I, T1O, T1Q;
               {
                    V T1l, T1H, T1R, T7, T1x, TN, TC, T25, T1E, T1b, T1Z, Tt, T2h, T22, T1D;
                    V T1g, T1n, TQ, T11, Ti, Te, T26, T1m, TT, T1S, TJ, TZ, T1V, TW, Tl;
                    V T12, T13;
                    {
                         V Tq, T1c, Tp, T20, T1a, Tr, T1d, T1e;
                         {
                              V T1, T2, Tw, Tx, T4, T5, Tz, TA;
                              T1 = LD(&(ri[0]), ivs, &(ri[0]));
                              T2 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
                              Tw = LD(&(ii[0]), ivs, &(ii[0]));
                              Tx = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
                              T4 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
                              T5 = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
                              Tz = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
                              TA = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
                              {
                                   V Tn, TL, T3, T1k, Ty, T1j, T6, TM, TB, To, T18, T19;
                                   Tn = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
                                   TL = VSUB(T1, T2);
                                   T3 = VADD(T1, T2);
                                   T1k = VSUB(Tw, Tx);
                                   Ty = VADD(Tw, Tx);
                                   T1j = VSUB(T4, T5);
                                   T6 = VADD(T4, T5);
                                   TM = VSUB(Tz, TA);
                                   TB = VADD(Tz, TA);
                                   To = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
                                   T18 = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
                                   T19 = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
                                   Tq = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
                                   T1l = VADD(T1j, T1k);
                                   T1H = VSUB(T1k, T1j);
                                   T1R = VSUB(T3, T6);
                                   T7 = VADD(T3, T6);
                                   T1x = VADD(TL, TM);
                                   TN = VSUB(TL, TM);
                                   TC = VADD(Ty, TB);
                                   T25 = VSUB(Ty, TB);
                                   T1c = VSUB(Tn, To);
                                   Tp = VADD(Tn, To);
                                   T20 = VADD(T18, T19);
                                   T1a = VSUB(T18, T19);
                                   Tr = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
                                   T1d = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
                                   T1e = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
                              }
                         }
                         {
                              V Tb, Ta, TF, Tc, TG, TH, TP, TO;
                              {
                                   V T8, T9, TD, TE;
                                   T8 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
                                   T9 = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
                                   TD = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
                                   TE = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
                                   Tb = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
                                   {
                                        V T17, Ts, T21, T1f;
                                        T17 = VSUB(Tq, Tr);
                                        Ts = VADD(Tq, Tr);
                                        T21 = VADD(T1d, T1e);
                                        T1f = VSUB(T1d, T1e);
                                        TP = VSUB(T8, T9);
                                        Ta = VADD(T8, T9);
                                        TO = VSUB(TD, TE);
                                        TF = VADD(TD, TE);
                                        T1E = VSUB(T1a, T17);
                                        T1b = VADD(T17, T1a);
                                        T1Z = VSUB(Tp, Ts);
                                        Tt = VADD(Tp, Ts);
                                        T2h = VADD(T20, T21);
                                        T22 = VSUB(T20, T21);
                                        T1D = VADD(T1c, T1f);
                                        T1g = VSUB(T1c, T1f);
                                        Tc = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
                                   }
                                   TG = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
                                   TH = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
                              }
                              T1n = VADD(TP, TO);
                              TQ = VSUB(TO, TP);
                              {
                                   V Tg, Th, TX, TR, Td, TS, TI, TY, Tj, Tk;
                                   Tg = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
                                   Th = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
                                   TX = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
                                   TR = VSUB(Tb, Tc);
                                   Td = VADD(Tb, Tc);
                                   TS = VSUB(TG, TH);
                                   TI = VADD(TG, TH);
                                   TY = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
                                   Tj = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
                                   T11 = VSUB(Tg, Th);
                                   Ti = VADD(Tg, Th);
                                   Tk = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
                                   Te = VADD(Ta, Td);
                                   T26 = VSUB(Td, Ta);
                                   T1m = VSUB(TR, TS);
                                   TT = VADD(TR, TS);
                                   T1S = VSUB(TF, TI);
                                   TJ = VADD(TF, TI);
                                   TZ = VSUB(TX, TY);
                                   T1V = VADD(TX, TY);
                                   TW = VSUB(Tj, Tk);
                                   Tl = VADD(Tj, Tk);
                                   T12 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
                                   T13 = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
                              }
                         }
                    }
                    {
                         V T2f, Tf, T2j, TK, Tm, T1U, T10, T1B, T14, T1W;
                         T2f = VSUB(T7, Te);
                         Tf = VADD(T7, Te);
                         T2j = VADD(TC, TJ);
                         TK = VSUB(TC, TJ);
                         Tm = VADD(Ti, Tl);
                         T1U = VSUB(Ti, Tl);
                         T10 = VADD(TW, TZ);
                         T1B = VSUB(TZ, TW);
                         T14 = VSUB(T12, T13);
                         T1W = VADD(T12, T13);
                         {
                              V T29, T1T, T27, T2d, T2b, T23, T15, T1A, T2l, T2m, T2n, T2o, T2i, T2k, T1Y;
                              V T2a;
                              {
                                   V Tv, Tu, T1X, T2g;
                                   T29 = VSUB(T1R, T1S);
                                   T1T = VADD(T1R, T1S);
                                   T27 = VSUB(T25, T26);
                                   T2d = VADD(T26, T25);
                                   T2b = VADD(T1Z, T22);
                                   T23 = VSUB(T1Z, T22);
                                   Tv = VSUB(Tt, Tm);
                                   Tu = VADD(Tm, Tt);
                                   T1X = VSUB(T1V, T1W);
                                   T2g = VADD(T1V, T1W);
                                   T15 = VSUB(T11, T14);
                                   T1A = VADD(T11, T14);
                                   T2l = VSUB(TK, Tv);
                                   STM4(&(io[12]), T2l, ovs, &(io[0]));
                                   T2m = VADD(Tv, TK);
                                   STM4(&(io[4]), T2m, ovs, &(io[0]));
                                   T2n = VADD(Tf, Tu);
                                   STM4(&(ro[0]), T2n, ovs, &(ro[0]));
                                   T2o = VSUB(Tf, Tu);
                                   STM4(&(ro[8]), T2o, ovs, &(ro[0]));
                                   T2i = VSUB(T2g, T2h);
                                   T2k = VADD(T2g, T2h);
                                   T1Y = VADD(T1U, T1X);
                                   T2a = VSUB(T1X, T1U);
                              }
                              {
                                   V T1I, T1y, T1t, T16, T1v, TV, T1r, T1p, T2t, T2u, T2v, T2w, T1h, T1s, TU;
                                   V T1o;
                                   T1I = VADD(TQ, TT);
                                   TU = VSUB(TQ, TT);
                                   T1o = VSUB(T1m, T1n);
                                   T1y = VADD(T1n, T1m);
                                   T1t = VFNMS(LDK(KP414213562), T10, T15);
                                   T16 = VFMA(LDK(KP414213562), T15, T10);
                                   T2p = VADD(T2f, T2i);
                                   STM4(&(ro[4]), T2p, ovs, &(ro[0]));
                                   T2q = VSUB(T2f, T2i);
                                   STM4(&(ro[12]), T2q, ovs, &(ro[0]));
                                   T2r = VADD(T2j, T2k);
                                   STM4(&(io[0]), T2r, ovs, &(io[0]));
                                   T2s = VSUB(T2j, T2k);
                                   STM4(&(io[8]), T2s, ovs, &(io[0]));
                                   {
                                        V T28, T24, T2e, T2c;
                                        T28 = VSUB(T23, T1Y);
                                        T24 = VADD(T1Y, T23);
                                        T2e = VADD(T2a, T2b);
                                        T2c = VSUB(T2a, T2b);
                                        T1v = VFNMS(LDK(KP707106781), TU, TN);
                                        TV = VFMA(LDK(KP707106781), TU, TN);
                                        T1r = VFMA(LDK(KP707106781), T1o, T1l);
                                        T1p = VFNMS(LDK(KP707106781), T1o, T1l);
                                        T2t = VFNMS(LDK(KP707106781), T28, T27);
                                        STM4(&(io[14]), T2t, ovs, &(io[0]));
                                        T2u = VFMA(LDK(KP707106781), T28, T27);
                                        STM4(&(io[6]), T2u, ovs, &(io[0]));
                                        T2v = VFMA(LDK(KP707106781), T24, T1T);
                                        STM4(&(ro[2]), T2v, ovs, &(ro[0]));
                                        T2w = VFNMS(LDK(KP707106781), T24, T1T);
                                        STM4(&(ro[10]), T2w, ovs, &(ro[0]));
                                        T2x = VFNMS(LDK(KP707106781), T2e, T2d);
                                        STM4(&(io[10]), T2x, ovs, &(io[0]));
                                        T2y = VFMA(LDK(KP707106781), T2e, T2d);
                                        STM4(&(io[2]), T2y, ovs, &(io[0]));
                                        T2z = VFMA(LDK(KP707106781), T2c, T29);
                                        STM4(&(ro[6]), T2z, ovs, &(ro[0]));
                                        T2A = VFNMS(LDK(KP707106781), T2c, T29);
                                        STM4(&(ro[14]), T2A, ovs, &(ro[0]));
                                        T1h = VFNMS(LDK(KP414213562), T1g, T1b);
                                        T1s = VFMA(LDK(KP414213562), T1b, T1g);
                                   }
                                   {
                                        V T1z, T1J, T1K, T1G, T2B, T2C, T2D, T2E, T1C, T1F;
                                        T1M = VFNMS(LDK(KP414213562), T1A, T1B);
                                        T1C = VFMA(LDK(KP414213562), T1B, T1A);
                                        T1F = VFNMS(LDK(KP414213562), T1E, T1D);
                                        T1N = VFMA(LDK(KP414213562), T1D, T1E);
                                        {
                                             V T1q, T1i, T1w, T1u;
                                             T1q = VADD(T16, T1h);
                                             T1i = VSUB(T16, T1h);
                                             T1w = VADD(T1t, T1s);
                                             T1u = VSUB(T1s, T1t);
                                             T1L = VFNMS(LDK(KP707106781), T1y, T1x);
                                             T1z = VFMA(LDK(KP707106781), T1y, T1x);
                                             T1P = VFMA(LDK(KP707106781), T1I, T1H);
                                             T1J = VFNMS(LDK(KP707106781), T1I, T1H);
                                             T1K = VSUB(T1F, T1C);
                                             T1G = VADD(T1C, T1F);
                                             T2B = VFMA(LDK(KP923879532), T1q, T1p);
                                             STM4(&(io[15]), T2B, ovs, &(io[1]));
                                             T2C = VFNMS(LDK(KP923879532), T1q, T1p);
                                             STM4(&(io[7]), T2C, ovs, &(io[1]));
                                             T2D = VFMA(LDK(KP923879532), T1i, TV);
                                             STM4(&(ro[3]), T2D, ovs, &(ro[1]));
                                             T2E = VFNMS(LDK(KP923879532), T1i, TV);
                                             STM4(&(ro[11]), T2E, ovs, &(ro[1]));
                                             T2F = VFMA(LDK(KP923879532), T1w, T1v);
                                             STM4(&(ro[15]), T2F, ovs, &(ro[1]));
                                             T2G = VFNMS(LDK(KP923879532), T1w, T1v);
                                             STM4(&(ro[7]), T2G, ovs, &(ro[1]));
                                             T2H = VFMA(LDK(KP923879532), T1u, T1r);
                                             STM4(&(io[3]), T2H, ovs, &(io[1]));
                                             T2I = VFNMS(LDK(KP923879532), T1u, T1r);
                                             STM4(&(io[11]), T2I, ovs, &(io[1]));
                                        }
                                        {
                                             V T2J, T2K, T2L, T2M;
                                             T2J = VFNMS(LDK(KP923879532), T1G, T1z);
                                             STM4(&(ro[9]), T2J, ovs, &(ro[1]));
                                             STN4(&(ro[8]), T2o, T2J, T2w, T2E, ovs);
                                             T2K = VFMA(LDK(KP923879532), T1G, T1z);
                                             STM4(&(ro[1]), T2K, ovs, &(ro[1]));
                                             STN4(&(ro[0]), T2n, T2K, T2v, T2D, ovs);
                                             T2L = VFNMS(LDK(KP923879532), T1K, T1J);
                                             STM4(&(io[13]), T2L, ovs, &(io[1]));
                                             STN4(&(io[12]), T2l, T2L, T2t, T2B, ovs);
                                             T2M = VFMA(LDK(KP923879532), T1K, T1J);
                                             STM4(&(io[5]), T2M, ovs, &(io[1]));
                                             STN4(&(io[4]), T2m, T2M, T2u, T2C, ovs);
                                        }
                                   }
                              }
                         }
                    }
               }
               T1O = VSUB(T1M, T1N);
               T1Q = VADD(T1M, T1N);
               {
                    V T2N, T2O, T2P, T2Q;
                    T2N = VFMA(LDK(KP923879532), T1Q, T1P);
                    STM4(&(io[1]), T2N, ovs, &(io[1]));
                    STN4(&(io[0]), T2r, T2N, T2y, T2H, ovs);
                    T2O = VFNMS(LDK(KP923879532), T1Q, T1P);
                    STM4(&(io[9]), T2O, ovs, &(io[1]));
                    STN4(&(io[8]), T2s, T2O, T2x, T2I, ovs);
                    T2P = VFMA(LDK(KP923879532), T1O, T1L);
                    STM4(&(ro[5]), T2P, ovs, &(ro[1]));
                    STN4(&(ro[4]), T2p, T2P, T2z, T2G, ovs);
                    T2Q = VFNMS(LDK(KP923879532), T1O, T1L);
                    STM4(&(ro[13]), T2Q, ovs, &(ro[1]));
                    STN4(&(ro[12]), T2q, T2Q, T2A, T2F, ovs);
               }
          }
     }
     VLEAVE();
}

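/* Editorial note: the {104, 0, 40, 0} initializer appears to mirror the
   operation counts in the comment above (additions, multiplications, fused
   multiply-adds, other), which the planner uses to cost this codelet. */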
static const kdft_desc desc = { 16, XSIMD_STRING("n2sv_16"), {104, 0, 40, 0}, &GENUS, 0, 1, 0, 0 };

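/* Editorial note: this entry point registers the codelet with the planner,
   making it a candidate whenever a size-16 DFT is planned. */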
void XSIMD(codelet_n2sv_16) (planner *p) {
     X(kdft_register) (p, n2sv_16, &desc);
}

#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n2sv_16 -with-ostride 1 -include n2s.h -store-multiple 4 */

/*
 * This function contains 144 FP additions, 24 FP multiplications,
 * (or, 136 additions, 16 multiplications, 8 fused multiply/add),
 * 74 stack variables, 3 constants, and 72 memory accesses
 */
#include "n2s.h"

static void n2sv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
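     /* Editorial note: to the printed precision, the constants below are
        sin(pi/8), cos(pi/8), and 1/sqrt(2); lacking FMA, the rotations in
        this variant spell out the products with explicit VMUL calls. */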
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
          for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
               V T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
               V T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
               V T1U, T1A;
               {
                    V T3, TL, Ty, T1k, T6, T1j, TB, TM;
                    {
                         V T1, T2, Tw, Tx;
                         T1 = LD(&(ri[0]), ivs, &(ri[0]));
                         T2 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
                         T3 = VADD(T1, T2);
                         TL = VSUB(T1, T2);
                         Tw = LD(&(ii[0]), ivs, &(ii[0]));
                         Tx = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
                         Ty = VADD(Tw, Tx);
                         T1k = VSUB(Tw, Tx);
                    }
                    {
                         V T4, T5, Tz, TA;
                         T4 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
                         T5 = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
                         T6 = VADD(T4, T5);
                         T1j = VSUB(T4, T5);
                         Tz = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
                         TA = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
                         TB = VADD(Tz, TA);
                         TM = VSUB(Tz, TA);
                    }
                    T7 = VADD(T3, T6);
                    T1R = VSUB(T3, T6);
                    T25 = VSUB(Ty, TB);
                    TC = VADD(Ty, TB);
                    TN = VSUB(TL, TM);
                    T1x = VADD(TL, TM);
                    T1H = VSUB(T1k, T1j);
                    T1l = VADD(T1j, T1k);
               }
               {
                    V Tp, T17, T1f, T20, Ts, T1c, T1a, T21;
                    {
                         V Tn, To, T1d, T1e;
                         Tn = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
                         To = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
                         Tp = VADD(Tn, To);
                         T17 = VSUB(Tn, To);
                         T1d = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
                         T1e = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
                         T1f = VSUB(T1d, T1e);
                         T20 = VADD(T1d, T1e);
                    }
                    {
                         V Tq, Tr, T18, T19;
                         Tq = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
                         Tr = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
                         Ts = VADD(Tq, Tr);
                         T1c = VSUB(Tq, Tr);
                         T18 = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
                         T19 = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
                         T1a = VSUB(T18, T19);
                         T21 = VADD(T18, T19);
                    }
                    Tt = VADD(Tp, Ts);
                    T22 = VSUB(T20, T21);
                    T2h = VADD(T20, T21);
                    T1b = VSUB(T17, T1a);
                    T1g = VADD(T1c, T1f);
                    T1E = VSUB(T1f, T1c);
                    T1Z = VSUB(Tp, Ts);
                    T1D = VADD(T17, T1a);
               }
               {
                    V Ta, TP, TF, TO, Td, TR, TI, TS;
                    {
                         V T8, T9, TD, TE;
                         T8 = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
                         T9 = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
                         Ta = VADD(T8, T9);
                         TP = VSUB(T8, T9);
                         TD = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
                         TE = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
                         TF = VADD(TD, TE);
                         TO = VSUB(TD, TE);
                    }
                    {
                         V Tb, Tc, TG, TH;
                         Tb = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
                         Tc = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
                         Td = VADD(Tb, Tc);
                         TR = VSUB(Tb, Tc);
                         TG = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
                         TH = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
                         TI = VADD(TG, TH);
                         TS = VSUB(TG, TH);
                    }
                    Te = VADD(Ta, Td);
                    T1S = VSUB(TF, TI);
                    T26 = VSUB(Td, Ta);
                    TJ = VADD(TF, TI);
                    TQ = VSUB(TO, TP);
                    T1m = VSUB(TR, TS);
                    T1n = VADD(TP, TO);
                    TT = VADD(TR, TS);
               }
               {
                    V Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
                    {
                         V Tg, Th, TX, TY;
                         Tg = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
                         Th = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
                         Ti = VADD(Tg, Th);
                         T11 = VSUB(Tg, Th);
                         TX = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
                         TY = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
                         TZ = VSUB(TX, TY);
                         T1V = VADD(TX, TY);
                    }
                    {
                         V Tj, Tk, T12, T13;
                         Tj = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
                         Tk = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
                         Tl = VADD(Tj, Tk);
                         TW = VSUB(Tj, Tk);
                         T12 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
                         T13 = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
                         T14 = VSUB(T12, T13);
                         T1W = VADD(T12, T13);
                    }
                    Tm = VADD(Ti, Tl);
                    T1X = VSUB(T1V, T1W);
                    T2g = VADD(T1V, T1W);
                    T10 = VADD(TW, TZ);
                    T15 = VSUB(T11, T14);
                    T1B = VADD(T11, T14);
                    T1U = VSUB(Ti, Tl);
                    T1A = VSUB(TZ, TW);
               }
               {
                    V T2l, T2m, T2n, T2o, T2p, T2q, T2r, T2s;
                    {
                         V Tf, Tu, T2j, T2k;
                         Tf = VADD(T7, Te);
                         Tu = VADD(Tm, Tt);
                         T2l = VSUB(Tf, Tu);
                         STM4(&(ro[8]), T2l, ovs, &(ro[0]));
                         T2m = VADD(Tf, Tu);
                         STM4(&(ro[0]), T2m, ovs, &(ro[0]));
                         T2j = VADD(TC, TJ);
                         T2k = VADD(T2g, T2h);
                         T2n = VSUB(T2j, T2k);
                         STM4(&(io[8]), T2n, ovs, &(io[0]));
                         T2o = VADD(T2j, T2k);
                         STM4(&(io[0]), T2o, ovs, &(io[0]));
                    }
                    {
                         V Tv, TK, T2f, T2i;
                         Tv = VSUB(Tt, Tm);
                         TK = VSUB(TC, TJ);
                         T2p = VADD(Tv, TK);
                         STM4(&(io[4]), T2p, ovs, &(io[0]));
                         T2q = VSUB(TK, Tv);
                         STM4(&(io[12]), T2q, ovs, &(io[0]));
                         T2f = VSUB(T7, Te);
                         T2i = VSUB(T2g, T2h);
                         T2r = VSUB(T2f, T2i);
                         STM4(&(ro[12]), T2r, ovs, &(ro[0]));
                         T2s = VADD(T2f, T2i);
                         STM4(&(ro[4]), T2s, ovs, &(ro[0]));
                    }
                    {
                         V T2t, T2u, T2v, T2w, T2x, T2y, T2z, T2A;
                         {
                              V T1T, T27, T24, T28, T1Y, T23;
                              T1T = VADD(T1R, T1S);
                              T27 = VSUB(T25, T26);
                              T1Y = VADD(T1U, T1X);
                              T23 = VSUB(T1Z, T22);
                              T24 = VMUL(LDK(KP707106781), VADD(T1Y, T23));
                              T28 = VMUL(LDK(KP707106781), VSUB(T23, T1Y));
                              T2t = VSUB(T1T, T24);
                              STM4(&(ro[10]), T2t, ovs, &(ro[0]));
                              T2u = VADD(T27, T28);
                              STM4(&(io[6]), T2u, ovs, &(io[0]));
                              T2v = VADD(T1T, T24);
                              STM4(&(ro[2]), T2v, ovs, &(ro[0]));
                              T2w = VSUB(T27, T28);
                              STM4(&(io[14]), T2w, ovs, &(io[0]));
                         }
                         {
                              V T29, T2d, T2c, T2e, T2a, T2b;
                              T29 = VSUB(T1R, T1S);
                              T2d = VADD(T26, T25);
                              T2a = VSUB(T1X, T1U);
                              T2b = VADD(T1Z, T22);
                              T2c = VMUL(LDK(KP707106781), VSUB(T2a, T2b));
                              T2e = VMUL(LDK(KP707106781), VADD(T2a, T2b));
                              T2x = VSUB(T29, T2c);
                              STM4(&(ro[14]), T2x, ovs, &(ro[0]));
                              T2y = VADD(T2d, T2e);
                              STM4(&(io[2]), T2y, ovs, &(io[0]));
                              T2z = VADD(T29, T2c);
                              STM4(&(ro[6]), T2z, ovs, &(ro[0]));
                              T2A = VSUB(T2d, T2e);
                              STM4(&(io[10]), T2A, ovs, &(io[0]));
                         }
                         {
                              V T2B, T2C, T2D, T2E, T2F, T2G, T2H, T2I;
                              {
                                   V TV, T1r, T1p, T1v, T1i, T1q, T1u, T1w, TU, T1o;
                                   TU = VMUL(LDK(KP707106781), VSUB(TQ, TT));
                                   TV = VADD(TN, TU);
                                   T1r = VSUB(TN, TU);
                                   T1o = VMUL(LDK(KP707106781), VSUB(T1m, T1n));
                                   T1p = VSUB(T1l, T1o);
                                   T1v = VADD(T1l, T1o);
                                   {
                                        V T16, T1h, T1s, T1t;
                                        T16 = VFMA(LDK(KP923879532), T10, VMUL(LDK(KP382683432), T15));
                                        T1h = VFNMS(LDK(KP923879532), T1g, VMUL(LDK(KP382683432), T1b));
                                        T1i = VADD(T16, T1h);
                                        T1q = VSUB(T1h, T16);
                                        T1s = VFNMS(LDK(KP923879532), T15, VMUL(LDK(KP382683432), T10));
                                        T1t = VFMA(LDK(KP382683432), T1g, VMUL(LDK(KP923879532), T1b));
                                        T1u = VSUB(T1s, T1t);
                                        T1w = VADD(T1s, T1t);
                                   }
                                   T2B = VSUB(TV, T1i);
                                   STM4(&(ro[11]), T2B, ovs, &(ro[1]));
                                   T2C = VSUB(T1v, T1w);
                                   STM4(&(io[11]), T2C, ovs, &(io[1]));
                                   T2D = VADD(TV, T1i);
                                   STM4(&(ro[3]), T2D, ovs, &(ro[1]));
                                   T2E = VADD(T1v, T1w);
                                   STM4(&(io[3]), T2E, ovs, &(io[1]));
                                   T2F = VSUB(T1p, T1q);
                                   STM4(&(io[15]), T2F, ovs, &(io[1]));
                                   T2G = VSUB(T1r, T1u);
                                   STM4(&(ro[15]), T2G, ovs, &(ro[1]));
                                   T2H = VADD(T1p, T1q);
                                   STM4(&(io[7]), T2H, ovs, &(io[1]));
                                   T2I = VADD(T1r, T1u);
                                   STM4(&(ro[7]), T2I, ovs, &(ro[1]));
                              }
                              {
                                   V T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
                                   T1y = VMUL(LDK(KP707106781), VADD(T1n, T1m));
                                   T1z = VADD(T1x, T1y);
                                   T1L = VSUB(T1x, T1y);
                                   T1I = VMUL(LDK(KP707106781), VADD(TQ, TT));
                                   T1J = VSUB(T1H, T1I);
                                   T1P = VADD(T1H, T1I);
                                   {
                                        V T1C, T1F, T1M, T1N;
                                        T1C = VFMA(LDK(KP382683432), T1A, VMUL(LDK(KP923879532), T1B));
                                        T1F = VFNMS(LDK(KP382683432), T1E, VMUL(LDK(KP923879532), T1D));
                                        T1G = VADD(T1C, T1F);
                                        T1K = VSUB(T1F, T1C);
                                        T1M = VFNMS(LDK(KP382683432), T1B, VMUL(LDK(KP923879532), T1A));
                                        T1N = VFMA(LDK(KP923879532), T1E, VMUL(LDK(KP382683432), T1D));
                                        T1O = VSUB(T1M, T1N);
                                        T1Q = VADD(T1M, T1N);
                                   }
                                   {
                                        V T2J, T2K, T2L, T2M;
                                        T2J = VSUB(T1z, T1G);
                                        STM4(&(ro[9]), T2J, ovs, &(ro[1]));
                                        STN4(&(ro[8]), T2l, T2J, T2t, T2B, ovs);
                                        T2K = VSUB(T1P, T1Q);
                                        STM4(&(io[9]), T2K, ovs, &(io[1]));
                                        STN4(&(io[8]), T2n, T2K, T2A, T2C, ovs);
                                        T2L = VADD(T1z, T1G);
                                        STM4(&(ro[1]), T2L, ovs, &(ro[1]));
                                        STN4(&(ro[0]), T2m, T2L, T2v, T2D, ovs);
                                        T2M = VADD(T1P, T1Q);
                                        STM4(&(io[1]), T2M, ovs, &(io[1]));
                                        STN4(&(io[0]), T2o, T2M, T2y, T2E, ovs);
                                   }
                                   {
                                        V T2N, T2O, T2P, T2Q;
                                        T2N = VSUB(T1J, T1K);
                                        STM4(&(io[13]), T2N, ovs, &(io[1]));
                                        STN4(&(io[12]), T2q, T2N, T2w, T2F, ovs);
                                        T2O = VSUB(T1L, T1O);
                                        STM4(&(ro[13]), T2O, ovs, &(ro[1]));
                                        STN4(&(ro[12]), T2r, T2O, T2x, T2G, ovs);
                                        T2P = VADD(T1J, T1K);
                                        STM4(&(io[5]), T2P, ovs, &(io[1]));
                                        STN4(&(io[4]), T2p, T2P, T2u, T2H, ovs);
                                        T2Q = VADD(T1L, T1O);
                                        STM4(&(ro[5]), T2Q, ovs, &(ro[1]));
                                        STN4(&(ro[4]), T2s, T2Q, T2z, T2I, ovs);
                                   }
                              }
                         }
                    }
               }
          }
     }
     VLEAVE();
}

static const kdft_desc desc = { 16, XSIMD_STRING("n2sv_16"), {136, 16, 8, 0}, &GENUS, 0, 1, 0, 0 };

void XSIMD(codelet_n2sv_16) (planner *p) {
     X(kdft_register) (p, n2sv_16, &desc);
}

#endif /* HAVE_FMA */