annotate src/fftw-3.3.8/dft/simd/common/q1bv_5.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:14 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twidsq_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 5 -dif -name q1bv_5 -include dft/simd/q1b.h -sign 1 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 100 FP additions, 95 FP multiplications,
Chris@82 32 * (or, 55 additions, 50 multiplications, 45 fused multiply/add),
Chris@82 33 * 44 stack variables, 4 constants, and 50 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/q1b.h"
Chris@82 36
Chris@82 37 static void q1bv_5(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 40 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 42 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 43 {
Chris@82 44 INT m;
Chris@82 45 R *x;
Chris@82 46 x = ii;
Chris@82 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(10, vs)) {
Chris@82 48 V T1, Ta, Ti, Te, T8, T9, T1j, T1s, T1A, T1w, T1q, T1r, Tl, Tu, TC;
Chris@82 49 V Ty, Ts, Tt, TF, TO, TW, TS, TM, TN, TZ, T18, T1g, T1c, T16, T17;
Chris@82 50 {
Chris@82 51 V T7, Td, T4, Tc;
Chris@82 52 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 53 {
Chris@82 54 V T5, T6, T2, T3;
Chris@82 55 T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 56 T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 57 T7 = VADD(T5, T6);
Chris@82 58 Td = VSUB(T5, T6);
Chris@82 59 T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 60 T3 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 61 T4 = VADD(T2, T3);
Chris@82 62 Tc = VSUB(T2, T3);
Chris@82 63 }
Chris@82 64 Ta = VSUB(T4, T7);
Chris@82 65 Ti = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tc, Td));
Chris@82 66 Te = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Td, Tc));
Chris@82 67 T8 = VADD(T4, T7);
Chris@82 68 T9 = VFNMS(LDK(KP250000000), T8, T1);
Chris@82 69 }
Chris@82 70 {
Chris@82 71 V T1p, T1v, T1m, T1u;
Chris@82 72 T1j = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@82 73 {
Chris@82 74 V T1n, T1o, T1k, T1l;
Chris@82 75 T1n = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
Chris@82 76 T1o = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 77 T1p = VADD(T1n, T1o);
Chris@82 78 T1v = VSUB(T1n, T1o);
Chris@82 79 T1k = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 80 T1l = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@82 81 T1m = VADD(T1k, T1l);
Chris@82 82 T1u = VSUB(T1k, T1l);
Chris@82 83 }
Chris@82 84 T1s = VSUB(T1m, T1p);
Chris@82 85 T1A = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1u, T1v));
Chris@82 86 T1w = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1v, T1u));
Chris@82 87 T1q = VADD(T1m, T1p);
Chris@82 88 T1r = VFNMS(LDK(KP250000000), T1q, T1j);
Chris@82 89 }
Chris@82 90 {
Chris@82 91 V Tr, Tx, To, Tw;
Chris@82 92 Tl = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
Chris@82 93 {
Chris@82 94 V Tp, Tq, Tm, Tn;
Chris@82 95 Tp = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
Chris@82 96 Tq = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 97 Tr = VADD(Tp, Tq);
Chris@82 98 Tx = VSUB(Tp, Tq);
Chris@82 99 Tm = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 100 Tn = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
Chris@82 101 To = VADD(Tm, Tn);
Chris@82 102 Tw = VSUB(Tm, Tn);
Chris@82 103 }
Chris@82 104 Tu = VSUB(To, Tr);
Chris@82 105 TC = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tw, Tx));
Chris@82 106 Ty = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tx, Tw));
Chris@82 107 Ts = VADD(To, Tr);
Chris@82 108 Tt = VFNMS(LDK(KP250000000), Ts, Tl);
Chris@82 109 }
Chris@82 110 {
Chris@82 111 V TL, TR, TI, TQ;
Chris@82 112 TF = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@82 113 {
Chris@82 114 V TJ, TK, TG, TH;
Chris@82 115 TJ = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@82 116 TK = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 117 TL = VADD(TJ, TK);
Chris@82 118 TR = VSUB(TJ, TK);
Chris@82 119 TG = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 120 TH = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
Chris@82 121 TI = VADD(TG, TH);
Chris@82 122 TQ = VSUB(TG, TH);
Chris@82 123 }
Chris@82 124 TO = VSUB(TI, TL);
Chris@82 125 TW = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TQ, TR));
Chris@82 126 TS = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TR, TQ));
Chris@82 127 TM = VADD(TI, TL);
Chris@82 128 TN = VFNMS(LDK(KP250000000), TM, TF);
Chris@82 129 }
Chris@82 130 {
Chris@82 131 V T15, T1b, T12, T1a;
Chris@82 132 TZ = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
Chris@82 133 {
Chris@82 134 V T13, T14, T10, T11;
Chris@82 135 T13 = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
Chris@82 136 T14 = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 137 T15 = VADD(T13, T14);
Chris@82 138 T1b = VSUB(T13, T14);
Chris@82 139 T10 = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 140 T11 = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
Chris@82 141 T12 = VADD(T10, T11);
Chris@82 142 T1a = VSUB(T10, T11);
Chris@82 143 }
Chris@82 144 T18 = VSUB(T12, T15);
Chris@82 145 T1g = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1a, T1b));
Chris@82 146 T1c = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1b, T1a));
Chris@82 147 T16 = VADD(T12, T15);
Chris@82 148 T17 = VFNMS(LDK(KP250000000), T16, TZ);
Chris@82 149 }
Chris@82 150 ST(&(x[0]), VADD(T1, T8), ms, &(x[0]));
Chris@82 151 ST(&(x[WS(rs, 4)]), VADD(T1j, T1q), ms, &(x[0]));
Chris@82 152 ST(&(x[WS(rs, 2)]), VADD(TF, TM), ms, &(x[0]));
Chris@82 153 ST(&(x[WS(rs, 3)]), VADD(TZ, T16), ms, &(x[WS(rs, 1)]));
Chris@82 154 ST(&(x[WS(rs, 1)]), VADD(Tl, Ts), ms, &(x[WS(rs, 1)]));
Chris@82 155 {
Chris@82 156 V Tj, Tk, Th, T1B, T1C, T1z;
Chris@82 157 Th = VFNMS(LDK(KP559016994), Ta, T9);
Chris@82 158 Tj = BYTW(&(W[TWVL * 2]), VFNMSI(Ti, Th));
Chris@82 159 Tk = BYTW(&(W[TWVL * 4]), VFMAI(Ti, Th));
Chris@82 160 ST(&(x[WS(vs, 2)]), Tj, ms, &(x[WS(vs, 2)]));
Chris@82 161 ST(&(x[WS(vs, 3)]), Tk, ms, &(x[WS(vs, 3)]));
Chris@82 162 T1z = VFNMS(LDK(KP559016994), T1s, T1r);
Chris@82 163 T1B = BYTW(&(W[TWVL * 2]), VFNMSI(T1A, T1z));
Chris@82 164 T1C = BYTW(&(W[TWVL * 4]), VFMAI(T1A, T1z));
Chris@82 165 ST(&(x[WS(vs, 2) + WS(rs, 4)]), T1B, ms, &(x[WS(vs, 2)]));
Chris@82 166 ST(&(x[WS(vs, 3) + WS(rs, 4)]), T1C, ms, &(x[WS(vs, 3)]));
Chris@82 167 }
Chris@82 168 {
Chris@82 169 V T1h, T1i, T1f, TD, TE, TB;
Chris@82 170 T1f = VFNMS(LDK(KP559016994), T18, T17);
Chris@82 171 T1h = BYTW(&(W[TWVL * 2]), VFNMSI(T1g, T1f));
Chris@82 172 T1i = BYTW(&(W[TWVL * 4]), VFMAI(T1g, T1f));
Chris@82 173 ST(&(x[WS(vs, 2) + WS(rs, 3)]), T1h, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 174 ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1i, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 175 TB = VFNMS(LDK(KP559016994), Tu, Tt);
Chris@82 176 TD = BYTW(&(W[TWVL * 2]), VFNMSI(TC, TB));
Chris@82 177 TE = BYTW(&(W[TWVL * 4]), VFMAI(TC, TB));
Chris@82 178 ST(&(x[WS(vs, 2) + WS(rs, 1)]), TD, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 179 ST(&(x[WS(vs, 3) + WS(rs, 1)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 180 }
Chris@82 181 {
Chris@82 182 V TX, TY, TV, TT, TU, TP;
Chris@82 183 TV = VFNMS(LDK(KP559016994), TO, TN);
Chris@82 184 TX = BYTW(&(W[TWVL * 2]), VFNMSI(TW, TV));
Chris@82 185 TY = BYTW(&(W[TWVL * 4]), VFMAI(TW, TV));
Chris@82 186 ST(&(x[WS(vs, 2) + WS(rs, 2)]), TX, ms, &(x[WS(vs, 2)]));
Chris@82 187 ST(&(x[WS(vs, 3) + WS(rs, 2)]), TY, ms, &(x[WS(vs, 3)]));
Chris@82 188 TP = VFMA(LDK(KP559016994), TO, TN);
Chris@82 189 TT = BYTW(&(W[0]), VFMAI(TS, TP));
Chris@82 190 TU = BYTW(&(W[TWVL * 6]), VFNMSI(TS, TP));
Chris@82 191 ST(&(x[WS(vs, 1) + WS(rs, 2)]), TT, ms, &(x[WS(vs, 1)]));
Chris@82 192 ST(&(x[WS(vs, 4) + WS(rs, 2)]), TU, ms, &(x[WS(vs, 4)]));
Chris@82 193 }
Chris@82 194 {
Chris@82 195 V Tf, Tg, Tb, Tz, TA, Tv;
Chris@82 196 Tb = VFMA(LDK(KP559016994), Ta, T9);
Chris@82 197 Tf = BYTW(&(W[0]), VFMAI(Te, Tb));
Chris@82 198 Tg = BYTW(&(W[TWVL * 6]), VFNMSI(Te, Tb));
Chris@82 199 ST(&(x[WS(vs, 1)]), Tf, ms, &(x[WS(vs, 1)]));
Chris@82 200 ST(&(x[WS(vs, 4)]), Tg, ms, &(x[WS(vs, 4)]));
Chris@82 201 Tv = VFMA(LDK(KP559016994), Tu, Tt);
Chris@82 202 Tz = BYTW(&(W[0]), VFMAI(Ty, Tv));
Chris@82 203 TA = BYTW(&(W[TWVL * 6]), VFNMSI(Ty, Tv));
Chris@82 204 ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tz, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 205 ST(&(x[WS(vs, 4) + WS(rs, 1)]), TA, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 206 }
Chris@82 207 {
Chris@82 208 V T1d, T1e, T19, T1x, T1y, T1t;
Chris@82 209 T19 = VFMA(LDK(KP559016994), T18, T17);
Chris@82 210 T1d = BYTW(&(W[0]), VFMAI(T1c, T19));
Chris@82 211 T1e = BYTW(&(W[TWVL * 6]), VFNMSI(T1c, T19));
Chris@82 212 ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1d, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 213 ST(&(x[WS(vs, 4) + WS(rs, 3)]), T1e, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 214 T1t = VFMA(LDK(KP559016994), T1s, T1r);
Chris@82 215 T1x = BYTW(&(W[0]), VFMAI(T1w, T1t));
Chris@82 216 T1y = BYTW(&(W[TWVL * 6]), VFNMSI(T1w, T1t));
Chris@82 217 ST(&(x[WS(vs, 1) + WS(rs, 4)]), T1x, ms, &(x[WS(vs, 1)]));
Chris@82 218 ST(&(x[WS(vs, 4) + WS(rs, 4)]), T1y, ms, &(x[WS(vs, 4)]));
Chris@82 219 }
Chris@82 220 }
Chris@82 221 }
Chris@82 222 VLEAVE();
Chris@82 223 }
Chris@82 224
/* Twiddle descriptor: four VTW(0, k) entries mean this radix-5 codelet
   consumes twiddle factors W^1..W^4 per transform; TW_NEXT advances by
   VL for the next vector of transforms. */
Chris@82 225 static const tw_instr twinstr[] = {
Chris@82 226 VTW(0, 1),
Chris@82 227 VTW(0, 2),
Chris@82 228 VTW(0, 3),
Chris@82 229 VTW(0, 4),
Chris@82 230 {TW_NEXT, VL, 0}
Chris@82 231 };
Chris@82 232
/* Codelet descriptor: radix 5; the {55, 50, 45, 0} op counts match the
   generator banner above (adds, muls, fused multiply-adds, other). */
Chris@82 233 static const ct_desc desc = { 5, XSIMD_STRING("q1bv_5"), twinstr, &GENUS, {55, 50, 45, 0}, 0, 0, 0 };
Chris@82 234
/* Planner hook: registers this codelet in FFTW's DIF twiddled-square
   (difsq) kdft registry under the descriptor above. */
Chris@82 235 void XSIMD(codelet_q1bv_5) (planner *p) {
Chris@82 236 X(kdft_difsq_register) (p, q1bv_5, &desc);
Chris@82 237 }
Chris@82 238 #else
Chris@82 239
Chris@82 240 /* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -dif -name q1bv_5 -include dft/simd/q1b.h -sign 1 */
Chris@82 241
Chris@82 242 /*
Chris@82 243 * This function contains 100 FP additions, 70 FP multiplications,
Chris@82 244 * (or, 85 additions, 55 multiplications, 15 fused multiply/add),
Chris@82 245 * 44 stack variables, 4 constants, and 50 memory accesses
Chris@82 246 */
Chris@82 247 #include "dft/simd/q1b.h"
Chris@82 248
/* NOTE(review): annotate-dump listing; "Chris@82 NN" columns are VCS
   annotation, not code.
   q1bv_5, non-FMA fallback variant (same generator, without -fma):
   identical contract to the FMA variant above, expressed with separate
   VMUL/VADD/VSUB operations and VBYI (multiply by i) instead of the
   fused VFMAI/VFNMSI forms. */
Chris@82 249 static void q1bv_5(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 250 {
/* Radix-5 trig constants: 0.250000000 = 1/4,
   0.559016994... = sqrt(5)/4, 0.587785252... = sin(pi/5),
   0.951056516... = sin(2*pi/5). */
Chris@82 251 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 252 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 253 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 254 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 255 {
Chris@82 256 INT m;
Chris@82 257 R *x;
Chris@82 258 x = ii;
/* Each iteration handles VL interleaved problems; W advances by
   TWVL * 8 (four complex twiddle factors) per step. */
Chris@82 259 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(10, vs)) {
Chris@82 260 V Tb, T7, Th, Ta, Tc, Td, T1t, T1p, T1z, T1s, T1u, T1v, Tv, Tr, TB;
Chris@82 261 V Tu, Tw, Tx, TP, TL, TV, TO, TQ, TR, T19, T15, T1f, T18, T1a, T1b;
/* Five load/butterfly stages, one per row (vs offsets 0..4). */
Chris@82 262 {
Chris@82 263 V T6, T9, T3, T8;
Chris@82 264 Tb = LD(&(x[0]), ms, &(x[0]));
Chris@82 265 {
Chris@82 266 V T4, T5, T1, T2;
Chris@82 267 T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 268 T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 269 T6 = VSUB(T4, T5);
Chris@82 270 T9 = VADD(T4, T5);
Chris@82 271 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 272 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 273 T3 = VSUB(T1, T2);
Chris@82 274 T8 = VADD(T1, T2);
Chris@82 275 }
Chris@82 276 T7 = VBYI(VFMA(LDK(KP951056516), T3, VMUL(LDK(KP587785252), T6)));
Chris@82 277 Th = VBYI(VFNMS(LDK(KP951056516), T6, VMUL(LDK(KP587785252), T3)));
Chris@82 278 Ta = VMUL(LDK(KP559016994), VSUB(T8, T9));
Chris@82 279 Tc = VADD(T8, T9);
Chris@82 280 Td = VFNMS(LDK(KP250000000), Tc, Tb);
Chris@82 281 }
Chris@82 282 {
Chris@82 283 V T1o, T1r, T1l, T1q;
Chris@82 284 T1t = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@82 285 {
Chris@82 286 V T1m, T1n, T1j, T1k;
Chris@82 287 T1m = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
Chris@82 288 T1n = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 289 T1o = VSUB(T1m, T1n);
Chris@82 290 T1r = VADD(T1m, T1n);
Chris@82 291 T1j = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 292 T1k = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@82 293 T1l = VSUB(T1j, T1k);
Chris@82 294 T1q = VADD(T1j, T1k);
Chris@82 295 }
Chris@82 296 T1p = VBYI(VFMA(LDK(KP951056516), T1l, VMUL(LDK(KP587785252), T1o)));
Chris@82 297 T1z = VBYI(VFNMS(LDK(KP951056516), T1o, VMUL(LDK(KP587785252), T1l)));
Chris@82 298 T1s = VMUL(LDK(KP559016994), VSUB(T1q, T1r));
Chris@82 299 T1u = VADD(T1q, T1r);
Chris@82 300 T1v = VFNMS(LDK(KP250000000), T1u, T1t);
Chris@82 301 }
Chris@82 302 {
Chris@82 303 V Tq, Tt, Tn, Ts;
Chris@82 304 Tv = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
Chris@82 305 {
Chris@82 306 V To, Tp, Tl, Tm;
Chris@82 307 To = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
Chris@82 308 Tp = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 309 Tq = VSUB(To, Tp);
Chris@82 310 Tt = VADD(To, Tp);
Chris@82 311 Tl = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 312 Tm = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
Chris@82 313 Tn = VSUB(Tl, Tm);
Chris@82 314 Ts = VADD(Tl, Tm);
Chris@82 315 }
Chris@82 316 Tr = VBYI(VFMA(LDK(KP951056516), Tn, VMUL(LDK(KP587785252), Tq)));
Chris@82 317 TB = VBYI(VFNMS(LDK(KP951056516), Tq, VMUL(LDK(KP587785252), Tn)));
Chris@82 318 Tu = VMUL(LDK(KP559016994), VSUB(Ts, Tt));
Chris@82 319 Tw = VADD(Ts, Tt);
Chris@82 320 Tx = VFNMS(LDK(KP250000000), Tw, Tv);
Chris@82 321 }
Chris@82 322 {
Chris@82 323 V TK, TN, TH, TM;
Chris@82 324 TP = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@82 325 {
Chris@82 326 V TI, TJ, TF, TG;
Chris@82 327 TI = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@82 328 TJ = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 329 TK = VSUB(TI, TJ);
Chris@82 330 TN = VADD(TI, TJ);
Chris@82 331 TF = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 332 TG = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
Chris@82 333 TH = VSUB(TF, TG);
Chris@82 334 TM = VADD(TF, TG);
Chris@82 335 }
Chris@82 336 TL = VBYI(VFMA(LDK(KP951056516), TH, VMUL(LDK(KP587785252), TK)));
Chris@82 337 TV = VBYI(VFNMS(LDK(KP951056516), TK, VMUL(LDK(KP587785252), TH)));
Chris@82 338 TO = VMUL(LDK(KP559016994), VSUB(TM, TN));
Chris@82 339 TQ = VADD(TM, TN);
Chris@82 340 TR = VFNMS(LDK(KP250000000), TQ, TP);
Chris@82 341 }
Chris@82 342 {
Chris@82 343 V T14, T17, T11, T16;
Chris@82 344 T19 = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
Chris@82 345 {
Chris@82 346 V T12, T13, TZ, T10;
Chris@82 347 T12 = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
Chris@82 348 T13 = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 349 T14 = VSUB(T12, T13);
Chris@82 350 T17 = VADD(T12, T13);
Chris@82 351 TZ = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 352 T10 = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
Chris@82 353 T11 = VSUB(TZ, T10);
Chris@82 354 T16 = VADD(TZ, T10);
Chris@82 355 }
Chris@82 356 T15 = VBYI(VFMA(LDK(KP951056516), T11, VMUL(LDK(KP587785252), T14)));
Chris@82 357 T1f = VBYI(VFNMS(LDK(KP951056516), T14, VMUL(LDK(KP587785252), T11)));
Chris@82 358 T18 = VMUL(LDK(KP559016994), VSUB(T16, T17));
Chris@82 359 T1a = VADD(T16, T17);
Chris@82 360 T1b = VFNMS(LDK(KP250000000), T1a, T19);
Chris@82 361 }
/* Untwiddled (k = 0) outputs: plain sums stored back in place. */
Chris@82 362 ST(&(x[0]), VADD(Tb, Tc), ms, &(x[0]));
Chris@82 363 ST(&(x[WS(rs, 4)]), VADD(T1t, T1u), ms, &(x[0]));
Chris@82 364 ST(&(x[WS(rs, 2)]), VADD(TP, TQ), ms, &(x[0]));
Chris@82 365 ST(&(x[WS(rs, 3)]), VADD(T19, T1a), ms, &(x[WS(rs, 1)]));
Chris@82 366 ST(&(x[WS(rs, 1)]), VADD(Tv, Tw), ms, &(x[WS(rs, 1)]));
/* Remaining outputs are multiplied by a twiddle factor (BYTW) before
   being stored transposed across the vs stride. */
Chris@82 367 {
Chris@82 368 V Tj, Tk, Ti, T1B, T1C, T1A;
Chris@82 369 Ti = VSUB(Td, Ta);
Chris@82 370 Tj = BYTW(&(W[TWVL * 2]), VADD(Th, Ti));
Chris@82 371 Tk = BYTW(&(W[TWVL * 4]), VSUB(Ti, Th));
Chris@82 372 ST(&(x[WS(vs, 2)]), Tj, ms, &(x[WS(vs, 2)]));
Chris@82 373 ST(&(x[WS(vs, 3)]), Tk, ms, &(x[WS(vs, 3)]));
Chris@82 374 T1A = VSUB(T1v, T1s);
Chris@82 375 T1B = BYTW(&(W[TWVL * 2]), VADD(T1z, T1A));
Chris@82 376 T1C = BYTW(&(W[TWVL * 4]), VSUB(T1A, T1z));
Chris@82 377 ST(&(x[WS(vs, 2) + WS(rs, 4)]), T1B, ms, &(x[WS(vs, 2)]));
Chris@82 378 ST(&(x[WS(vs, 3) + WS(rs, 4)]), T1C, ms, &(x[WS(vs, 3)]));
Chris@82 379 }
Chris@82 380 {
Chris@82 381 V T1h, T1i, T1g, TD, TE, TC;
Chris@82 382 T1g = VSUB(T1b, T18);
Chris@82 383 T1h = BYTW(&(W[TWVL * 2]), VADD(T1f, T1g));
Chris@82 384 T1i = BYTW(&(W[TWVL * 4]), VSUB(T1g, T1f));
Chris@82 385 ST(&(x[WS(vs, 2) + WS(rs, 3)]), T1h, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 386 ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1i, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 387 TC = VSUB(Tx, Tu);
Chris@82 388 TD = BYTW(&(W[TWVL * 2]), VADD(TB, TC));
Chris@82 389 TE = BYTW(&(W[TWVL * 4]), VSUB(TC, TB));
Chris@82 390 ST(&(x[WS(vs, 2) + WS(rs, 1)]), TD, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 391 ST(&(x[WS(vs, 3) + WS(rs, 1)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 392 }
Chris@82 393 {
Chris@82 394 V TX, TY, TW, TT, TU, TS;
Chris@82 395 TW = VSUB(TR, TO);
Chris@82 396 TX = BYTW(&(W[TWVL * 2]), VADD(TV, TW));
Chris@82 397 TY = BYTW(&(W[TWVL * 4]), VSUB(TW, TV));
Chris@82 398 ST(&(x[WS(vs, 2) + WS(rs, 2)]), TX, ms, &(x[WS(vs, 2)]));
Chris@82 399 ST(&(x[WS(vs, 3) + WS(rs, 2)]), TY, ms, &(x[WS(vs, 3)]));
Chris@82 400 TS = VADD(TO, TR);
Chris@82 401 TT = BYTW(&(W[0]), VADD(TL, TS));
Chris@82 402 TU = BYTW(&(W[TWVL * 6]), VSUB(TS, TL));
Chris@82 403 ST(&(x[WS(vs, 1) + WS(rs, 2)]), TT, ms, &(x[WS(vs, 1)]));
Chris@82 404 ST(&(x[WS(vs, 4) + WS(rs, 2)]), TU, ms, &(x[WS(vs, 4)]));
Chris@82 405 }
Chris@82 406 {
Chris@82 407 V Tf, Tg, Te, Tz, TA, Ty;
Chris@82 408 Te = VADD(Ta, Td);
Chris@82 409 Tf = BYTW(&(W[0]), VADD(T7, Te));
Chris@82 410 Tg = BYTW(&(W[TWVL * 6]), VSUB(Te, T7));
Chris@82 411 ST(&(x[WS(vs, 1)]), Tf, ms, &(x[WS(vs, 1)]));
Chris@82 412 ST(&(x[WS(vs, 4)]), Tg, ms, &(x[WS(vs, 4)]));
Chris@82 413 Ty = VADD(Tu, Tx);
Chris@82 414 Tz = BYTW(&(W[0]), VADD(Tr, Ty));
Chris@82 415 TA = BYTW(&(W[TWVL * 6]), VSUB(Ty, Tr));
Chris@82 416 ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tz, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 417 ST(&(x[WS(vs, 4) + WS(rs, 1)]), TA, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 418 }
Chris@82 419 {
Chris@82 420 V T1d, T1e, T1c, T1x, T1y, T1w;
Chris@82 421 T1c = VADD(T18, T1b);
Chris@82 422 T1d = BYTW(&(W[0]), VADD(T15, T1c));
Chris@82 423 T1e = BYTW(&(W[TWVL * 6]), VSUB(T1c, T15));
Chris@82 424 ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1d, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 425 ST(&(x[WS(vs, 4) + WS(rs, 3)]), T1e, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 426 T1w = VADD(T1s, T1v);
Chris@82 427 T1x = BYTW(&(W[0]), VADD(T1p, T1w));
Chris@82 428 T1y = BYTW(&(W[TWVL * 6]), VSUB(T1w, T1p));
Chris@82 429 ST(&(x[WS(vs, 1) + WS(rs, 4)]), T1x, ms, &(x[WS(vs, 1)]));
Chris@82 430 ST(&(x[WS(vs, 4) + WS(rs, 4)]), T1y, ms, &(x[WS(vs, 4)]));
Chris@82 431 }
Chris@82 432 }
Chris@82 433 }
Chris@82 434 VLEAVE();
Chris@82 435 }
Chris@82 436
/* Twiddle descriptor (non-FMA branch, identical to the FMA branch):
   four VTW(0, k) entries => twiddles W^1..W^4; TW_NEXT steps by VL. */
Chris@82 437 static const tw_instr twinstr[] = {
Chris@82 438 VTW(0, 1),
Chris@82 439 VTW(0, 2),
Chris@82 440 VTW(0, 3),
Chris@82 441 VTW(0, 4),
Chris@82 442 {TW_NEXT, VL, 0}
Chris@82 443 };
Chris@82 444
/* Codelet descriptor: radix 5; the {85, 55, 15, 0} op counts match the
   non-FMA generator banner above (adds, muls, fused multiply-adds, other). */
Chris@82 445 static const ct_desc desc = { 5, XSIMD_STRING("q1bv_5"), twinstr, &GENUS, {85, 55, 15, 0}, 0, 0, 0 };
Chris@82 446
/* Planner hook (non-FMA branch): registers this codelet in FFTW's DIF
   twiddled-square (difsq) kdft registry. */
Chris@82 447 void XSIMD(codelet_q1bv_5) (planner *p) {
Chris@82 448 X(kdft_difsq_register) (p, q1bv_5, &desc);
Chris@82 449 }
Chris@82 450 #endif