annotate src/fftw-3.3.3/dft/simd/common/q1fv_5.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:39:31 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 5 -dif -name q1fv_5 -include q1f.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 100 FP additions, 95 FP multiplications,
Chris@10 32 * (or, 55 additions, 50 multiplications, 45 fused multiply/add),
Chris@10 33 * 69 stack variables, 4 constants, and 50 memory accesses
Chris@10 34 */
Chris@10 35 #include "q1f.h"
Chris@10 36
/*
 * q1fv_5 (FMA variant): radix-5 "twiddle-square" DIF DFT codelet,
 * machine-generated by genfft (command line in the header comment above)
 * -- DO NOT EDIT by hand; regenerate instead.
 *
 * NOTE(review): this listing is an `hg annotate` dump; the `Chris@10 NN`
 * prefixes are VCS annotation markers, not part of the compilable source.
 *
 * ri/ii: real/imaginary data (accessed via the SIMD LD/ST macros on x = ri)
 * W:     twiddle-factor table; advanced by TWVL*8 per vector iteration
 * rs/vs: radix stride and vector stride (see WS() uses below)
 * mb,me: half-open iteration range [mb, me); ms: per-transform stride
 */
Chris@10 37 static void q1fv_5(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@10 38 {
/* SIMD vector constants for the radix-5 butterfly:
 * KP559016994 = sqrt(5)/4, KP250000000 = 1/4,
 * KP618033988 = tan-related golden-ratio term, KP951056516 = sin(2*pi/5). */
Chris@10 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 40 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@10 42 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 43 {
Chris@10 44 INT m;
Chris@10 45 R *x;
Chris@10 46 x = ri;
/* Main loop: one 5x5 in-place pass per VL-wide vector of transforms.
 * The instruction order below was chosen by genfft's scheduler for an
 * 8-cycle pipeline latency -- loads, FMAs and stores are interleaved
 * deliberately; do not reorder. */
Chris@10 47 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(10, vs)) {
Chris@10 48 V Te, T1w, Ty, TS, TW, Tb, T1t, Tv, T1g, T1c, TP, TV, T1f, T19, TY;
Chris@10 49 V TX;
Chris@10 50 {
Chris@10 51 V T1, T1j, Tl, Ti, Ta, T8, T1A, T1q, T1s, T9, TF, T1r, TZ, TR, TL;
Chris@10 52 V TC, Ts, Tu, TQ, TI, T15, T1b, T10, T11, Tt;
Chris@10 53 {
Chris@10 54 V T1n, T1o, T1k, T1l, T7, Td, T4, Tc;
Chris@10 55 {
Chris@10 56 V T5, T6, T2, T3;
Chris@10 57 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 58 T5 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 59 T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 60 T2 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 61 T3 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 62 T1j = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@10 63 T1n = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
Chris@10 64 T1o = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 65 T1k = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 66 T1l = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@10 67 T7 = VADD(T5, T6);
Chris@10 68 Td = VSUB(T5, T6);
Chris@10 69 T4 = VADD(T2, T3);
Chris@10 70 Tc = VSUB(T2, T3);
Chris@10 71 }
Chris@10 72 {
Chris@10 73 V Tm, Tn, Tr, Tx, T1v, T1p;
Chris@10 74 Tl = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
Chris@10 75 T1v = VSUB(T1n, T1o);
Chris@10 76 T1p = VADD(T1n, T1o);
Chris@10 77 {
Chris@10 78 V T1u, T1m, Tp, Tq;
Chris@10 79 T1u = VSUB(T1k, T1l);
Chris@10 80 T1m = VADD(T1k, T1l);
Chris@10 81 Tp = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
Chris@10 82 Ti = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tc, Td));
Chris@10 83 Te = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Td, Tc));
Chris@10 84 Ta = VSUB(T4, T7);
Chris@10 85 T8 = VADD(T4, T7);
Chris@10 86 Tq = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 87 T1w = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1v, T1u));
Chris@10 88 T1A = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1u, T1v));
Chris@10 89 T1q = VADD(T1m, T1p);
Chris@10 90 T1s = VSUB(T1m, T1p);
Chris@10 91 Tm = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 92 T9 = VFNMS(LDK(KP250000000), T8, T1);
Chris@10 93 Tn = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
Chris@10 94 Tr = VADD(Tp, Tq);
Chris@10 95 Tx = VSUB(Tp, Tq);
Chris@10 96 }
Chris@10 97 {
Chris@10 98 V TJ, TK, TG, Tw, To, TH, T13, T14;
Chris@10 99 TF = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@10 100 T1r = VFNMS(LDK(KP250000000), T1q, T1j);
Chris@10 101 TJ = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@10 102 TK = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 103 TG = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 104 Tw = VSUB(Tm, Tn);
Chris@10 105 To = VADD(Tm, Tn);
Chris@10 106 TH = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
Chris@10 107 TZ = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
Chris@10 108 T13 = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
Chris@10 109 T14 = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 110 TR = VSUB(TJ, TK);
Chris@10 111 TL = VADD(TJ, TK);
Chris@10 112 Ty = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tx, Tw));
Chris@10 113 TC = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), Tw, Tx));
Chris@10 114 Ts = VADD(To, Tr);
Chris@10 115 Tu = VSUB(To, Tr);
Chris@10 116 TQ = VSUB(TG, TH);
Chris@10 117 TI = VADD(TG, TH);
Chris@10 118 T15 = VADD(T13, T14);
Chris@10 119 T1b = VSUB(T13, T14);
Chris@10 120 T10 = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 121 T11 = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
Chris@10 122 Tt = VFNMS(LDK(KP250000000), Ts, Tl);
Chris@10 123 }
Chris@10 124 }
Chris@10 125 }
/* Second half: DC outputs stored untwisted; remaining outputs are
 * twiddled with BYTWJ against W before being stored. */
Chris@10 126 {
Chris@10 127 V TO, T12, T1a, Th, T1z, TN, TM, T18, T17;
Chris@10 128 ST(&(x[0]), VADD(T1, T8), ms, &(x[0]));
Chris@10 129 TS = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TR, TQ));
Chris@10 130 TW = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TQ, TR));
Chris@10 131 TM = VADD(TI, TL);
Chris@10 132 TO = VSUB(TI, TL);
Chris@10 133 ST(&(x[WS(rs, 4)]), VADD(T1j, T1q), ms, &(x[0]));
Chris@10 134 T12 = VADD(T10, T11);
Chris@10 135 T1a = VSUB(T10, T11);
Chris@10 136 ST(&(x[WS(rs, 1)]), VADD(Tl, Ts), ms, &(x[WS(rs, 1)]));
Chris@10 137 Th = VFNMS(LDK(KP559016994), Ta, T9);
Chris@10 138 Tb = VFMA(LDK(KP559016994), Ta, T9);
Chris@10 139 T1t = VFMA(LDK(KP559016994), T1s, T1r);
Chris@10 140 T1z = VFNMS(LDK(KP559016994), T1s, T1r);
Chris@10 141 ST(&(x[WS(rs, 2)]), VADD(TF, TM), ms, &(x[0]));
Chris@10 142 TN = VFNMS(LDK(KP250000000), TM, TF);
Chris@10 143 {
Chris@10 144 V T16, Tk, Tj, T1C, T1B, TD, TE, TB;
Chris@10 145 TB = VFNMS(LDK(KP559016994), Tu, Tt);
Chris@10 146 Tv = VFMA(LDK(KP559016994), Tu, Tt);
Chris@10 147 T1g = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1a, T1b));
Chris@10 148 T1c = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1b, T1a));
Chris@10 149 T18 = VSUB(T12, T15);
Chris@10 150 T16 = VADD(T12, T15);
Chris@10 151 Tk = BYTWJ(&(W[TWVL * 4]), VFNMSI(Ti, Th));
Chris@10 152 Tj = BYTWJ(&(W[TWVL * 2]), VFMAI(Ti, Th));
Chris@10 153 T1C = BYTWJ(&(W[TWVL * 4]), VFNMSI(T1A, T1z));
Chris@10 154 T1B = BYTWJ(&(W[TWVL * 2]), VFMAI(T1A, T1z));
Chris@10 155 TD = BYTWJ(&(W[TWVL * 2]), VFMAI(TC, TB));
Chris@10 156 TE = BYTWJ(&(W[TWVL * 4]), VFNMSI(TC, TB));
Chris@10 157 ST(&(x[WS(rs, 3)]), VADD(TZ, T16), ms, &(x[WS(rs, 1)]));
Chris@10 158 T17 = VFNMS(LDK(KP250000000), T16, TZ);
Chris@10 159 ST(&(x[WS(vs, 3)]), Tk, ms, &(x[WS(vs, 3)]));
Chris@10 160 ST(&(x[WS(vs, 2)]), Tj, ms, &(x[WS(vs, 2)]));
Chris@10 161 ST(&(x[WS(vs, 3) + WS(rs, 4)]), T1C, ms, &(x[WS(vs, 3)]));
Chris@10 162 ST(&(x[WS(vs, 2) + WS(rs, 4)]), T1B, ms, &(x[WS(vs, 2)]));
Chris@10 163 ST(&(x[WS(vs, 2) + WS(rs, 1)]), TD, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 164 ST(&(x[WS(vs, 3) + WS(rs, 1)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 165 }
Chris@10 166 TP = VFMA(LDK(KP559016994), TO, TN);
Chris@10 167 TV = VFNMS(LDK(KP559016994), TO, TN);
Chris@10 168 T1f = VFNMS(LDK(KP559016994), T18, T17);
Chris@10 169 T19 = VFMA(LDK(KP559016994), T18, T17);
Chris@10 170 }
Chris@10 171 }
Chris@10 172 TY = BYTWJ(&(W[TWVL * 4]), VFNMSI(TW, TV));
Chris@10 173 TX = BYTWJ(&(W[TWVL * 2]), VFMAI(TW, TV));
Chris@10 174 {
Chris@10 175 V T1i, T1h, TU, TT;
Chris@10 176 T1i = BYTWJ(&(W[TWVL * 4]), VFNMSI(T1g, T1f));
Chris@10 177 T1h = BYTWJ(&(W[TWVL * 2]), VFMAI(T1g, T1f));
Chris@10 178 TU = BYTWJ(&(W[TWVL * 6]), VFMAI(TS, TP));
Chris@10 179 TT = BYTWJ(&(W[0]), VFNMSI(TS, TP));
Chris@10 180 {
Chris@10 181 V Tg, Tf, TA, Tz;
Chris@10 182 Tg = BYTWJ(&(W[TWVL * 6]), VFMAI(Te, Tb));
Chris@10 183 Tf = BYTWJ(&(W[0]), VFNMSI(Te, Tb));
Chris@10 184 TA = BYTWJ(&(W[TWVL * 6]), VFMAI(Ty, Tv));
Chris@10 185 Tz = BYTWJ(&(W[0]), VFNMSI(Ty, Tv));
Chris@10 186 {
Chris@10 187 V T1e, T1d, T1y, T1x;
Chris@10 188 T1e = BYTWJ(&(W[TWVL * 6]), VFMAI(T1c, T19));
Chris@10 189 T1d = BYTWJ(&(W[0]), VFNMSI(T1c, T19));
Chris@10 190 T1y = BYTWJ(&(W[TWVL * 6]), VFMAI(T1w, T1t));
Chris@10 191 T1x = BYTWJ(&(W[0]), VFNMSI(T1w, T1t));
Chris@10 192 ST(&(x[WS(vs, 3) + WS(rs, 2)]), TY, ms, &(x[WS(vs, 3)]));
Chris@10 193 ST(&(x[WS(vs, 2) + WS(rs, 2)]), TX, ms, &(x[WS(vs, 2)]));
Chris@10 194 ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1i, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 195 ST(&(x[WS(vs, 2) + WS(rs, 3)]), T1h, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 196 ST(&(x[WS(vs, 4) + WS(rs, 2)]), TU, ms, &(x[WS(vs, 4)]));
Chris@10 197 ST(&(x[WS(vs, 1) + WS(rs, 2)]), TT, ms, &(x[WS(vs, 1)]));
Chris@10 198 ST(&(x[WS(vs, 4)]), Tg, ms, &(x[WS(vs, 4)]));
Chris@10 199 ST(&(x[WS(vs, 1)]), Tf, ms, &(x[WS(vs, 1)]));
Chris@10 200 ST(&(x[WS(vs, 4) + WS(rs, 1)]), TA, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 201 ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tz, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 202 ST(&(x[WS(vs, 4) + WS(rs, 3)]), T1e, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 203 ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1d, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 204 }
Chris@10 205 }
Chris@10 206 }
Chris@10 207 }
Chris@10 208 }
Chris@10 209 VLEAVE();
Chris@10 210 }
Chris@10 213
/* Twiddle-table layout descriptor: one full complex twiddle per output
 * index 1..4; TW_NEXT advances to the next VL-wide batch. */
Chris@10 214 static const tw_instr twinstr[] = {
Chris@10 215 VTW(0, 1),
Chris@10 216 VTW(0, 2),
Chris@10 217 VTW(0, 3),
Chris@10 218 VTW(0, 4),
Chris@10 219 {TW_NEXT, VL, 0}
Chris@10 220 };
Chris@10 221
/* Codelet descriptor: radix 5, op counts {adds, muls, fmas, other} as
 * reported in the generation comment above. */
Chris@10 222 static const ct_desc desc = { 5, XSIMD_STRING("q1fv_5"), twinstr, &GENUS, {55, 50, 45, 0}, 0, 0, 0 };
Chris@10 223
/* Register this codelet with the FFTW planner (DIF twiddle-square family). */
Chris@10 224 void XSIMD(codelet_q1fv_5) (planner *p) {
Chris@10 225 X(kdft_difsq_register) (p, q1fv_5, &desc);
Chris@10 226 }
Chris@10 227 #else /* HAVE_FMA */
Chris@10 228
Chris@10 229 /* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 5 -dif -name q1fv_5 -include q1f.h */
Chris@10 230
Chris@10 231 /*
Chris@10 232 * This function contains 100 FP additions, 70 FP multiplications,
Chris@10 233 * (or, 85 additions, 55 multiplications, 15 fused multiply/add),
Chris@10 234 * 44 stack variables, 4 constants, and 50 memory accesses
Chris@10 235 */
Chris@10 236 #include "q1f.h"
Chris@10 237
/*
 * q1fv_5 (non-FMA variant): same radix-5 twiddle-square DIF DFT as the
 * FMA version above, generated without -fma/-reorder-insns, so it uses
 * separate VMUL/VADD operations and a simpler statement order.
 * Machine-generated -- DO NOT EDIT by hand; regenerate instead.
 *
 * NOTE(review): the `Chris@10 NN` prefixes are `hg annotate` markers,
 * not part of the compilable source.
 *
 * Parameters are identical to the FMA variant: ri/ii data, W twiddles,
 * rs/vs strides, [mb, me) loop range, ms element stride.
 */
Chris@10 238 static void q1fv_5(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@10 239 {
/* Radix-5 butterfly constants (KP587785252 = sin(pi/5) replaces the
 * KP618033988 term used by the FMA formulation). */
Chris@10 240 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 241 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@10 242 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 243 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 244 {
Chris@10 245 INT m;
Chris@10 246 R *x;
Chris@10 247 x = ri;
Chris@10 248 for (m = mb, W = W + (mb * ((TWVL / VL) * 8)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(10, vs)) {
Chris@10 249 V T8, T7, Th, Te, T9, Ta, T1q, T1p, T1z, T1w, T1r, T1s, Ts, Tr, TB;
Chris@10 250 V Ty, Tt, Tu, TM, TL, TV, TS, TN, TO, T16, T15, T1f, T1c, T17, T18;
/* Five structurally identical load/butterfly stages follow, one per
 * vs-row (rows 0, 4, 1, 2, 3 of the 5x5 block). */
Chris@10 251 {
Chris@10 252 V T6, Td, T3, Tc;
Chris@10 253 T8 = LD(&(x[0]), ms, &(x[0]));
Chris@10 254 {
Chris@10 255 V T4, T5, T1, T2;
Chris@10 256 T4 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 257 T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 258 T6 = VADD(T4, T5);
Chris@10 259 Td = VSUB(T4, T5);
Chris@10 260 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 261 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 262 T3 = VADD(T1, T2);
Chris@10 263 Tc = VSUB(T1, T2);
Chris@10 264 }
Chris@10 265 T7 = VMUL(LDK(KP559016994), VSUB(T3, T6));
Chris@10 266 Th = VBYI(VFNMS(LDK(KP587785252), Tc, VMUL(LDK(KP951056516), Td)));
Chris@10 267 Te = VBYI(VFMA(LDK(KP951056516), Tc, VMUL(LDK(KP587785252), Td)));
Chris@10 268 T9 = VADD(T3, T6);
Chris@10 269 Ta = VFNMS(LDK(KP250000000), T9, T8);
Chris@10 270 }
Chris@10 271 {
Chris@10 272 V T1o, T1v, T1l, T1u;
Chris@10 273 T1q = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@10 274 {
Chris@10 275 V T1m, T1n, T1j, T1k;
Chris@10 276 T1m = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
Chris@10 277 T1n = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 278 T1o = VADD(T1m, T1n);
Chris@10 279 T1v = VSUB(T1m, T1n);
Chris@10 280 T1j = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 281 T1k = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@10 282 T1l = VADD(T1j, T1k);
Chris@10 283 T1u = VSUB(T1j, T1k);
Chris@10 284 }
Chris@10 285 T1p = VMUL(LDK(KP559016994), VSUB(T1l, T1o));
Chris@10 286 T1z = VBYI(VFNMS(LDK(KP587785252), T1u, VMUL(LDK(KP951056516), T1v)));
Chris@10 287 T1w = VBYI(VFMA(LDK(KP951056516), T1u, VMUL(LDK(KP587785252), T1v)));
Chris@10 288 T1r = VADD(T1l, T1o);
Chris@10 289 T1s = VFNMS(LDK(KP250000000), T1r, T1q);
Chris@10 290 }
Chris@10 291 {
Chris@10 292 V Tq, Tx, Tn, Tw;
Chris@10 293 Ts = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
Chris@10 294 {
Chris@10 295 V To, Tp, Tl, Tm;
Chris@10 296 To = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
Chris@10 297 Tp = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 298 Tq = VADD(To, Tp);
Chris@10 299 Tx = VSUB(To, Tp);
Chris@10 300 Tl = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 301 Tm = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
Chris@10 302 Tn = VADD(Tl, Tm);
Chris@10 303 Tw = VSUB(Tl, Tm);
Chris@10 304 }
Chris@10 305 Tr = VMUL(LDK(KP559016994), VSUB(Tn, Tq));
Chris@10 306 TB = VBYI(VFNMS(LDK(KP587785252), Tw, VMUL(LDK(KP951056516), Tx)));
Chris@10 307 Ty = VBYI(VFMA(LDK(KP951056516), Tw, VMUL(LDK(KP587785252), Tx)));
Chris@10 308 Tt = VADD(Tn, Tq);
Chris@10 309 Tu = VFNMS(LDK(KP250000000), Tt, Ts);
Chris@10 310 }
Chris@10 311 {
Chris@10 312 V TK, TR, TH, TQ;
Chris@10 313 TM = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@10 314 {
Chris@10 315 V TI, TJ, TF, TG;
Chris@10 316 TI = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@10 317 TJ = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 318 TK = VADD(TI, TJ);
Chris@10 319 TR = VSUB(TI, TJ);
Chris@10 320 TF = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 321 TG = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
Chris@10 322 TH = VADD(TF, TG);
Chris@10 323 TQ = VSUB(TF, TG);
Chris@10 324 }
Chris@10 325 TL = VMUL(LDK(KP559016994), VSUB(TH, TK));
Chris@10 326 TV = VBYI(VFNMS(LDK(KP587785252), TQ, VMUL(LDK(KP951056516), TR)));
Chris@10 327 TS = VBYI(VFMA(LDK(KP951056516), TQ, VMUL(LDK(KP587785252), TR)));
Chris@10 328 TN = VADD(TH, TK);
Chris@10 329 TO = VFNMS(LDK(KP250000000), TN, TM);
Chris@10 330 }
Chris@10 331 {
Chris@10 332 V T14, T1b, T11, T1a;
Chris@10 333 T16 = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
Chris@10 334 {
Chris@10 335 V T12, T13, TZ, T10;
Chris@10 336 T12 = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
Chris@10 337 T13 = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 338 T14 = VADD(T12, T13);
Chris@10 339 T1b = VSUB(T12, T13);
Chris@10 340 TZ = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 341 T10 = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
Chris@10 342 T11 = VADD(TZ, T10);
Chris@10 343 T1a = VSUB(TZ, T10);
Chris@10 344 }
Chris@10 345 T15 = VMUL(LDK(KP559016994), VSUB(T11, T14));
Chris@10 346 T1f = VBYI(VFNMS(LDK(KP587785252), T1a, VMUL(LDK(KP951056516), T1b)));
Chris@10 347 T1c = VBYI(VFMA(LDK(KP951056516), T1a, VMUL(LDK(KP587785252), T1b)));
Chris@10 348 T17 = VADD(T11, T14);
Chris@10 349 T18 = VFNMS(LDK(KP250000000), T17, T16);
Chris@10 350 }
/* DC (k = 0) outputs need no twiddle. */
Chris@10 351 ST(&(x[0]), VADD(T8, T9), ms, &(x[0]));
Chris@10 352 ST(&(x[WS(rs, 4)]), VADD(T1q, T1r), ms, &(x[0]));
Chris@10 353 ST(&(x[WS(rs, 2)]), VADD(TM, TN), ms, &(x[0]));
Chris@10 354 ST(&(x[WS(rs, 3)]), VADD(T16, T17), ms, &(x[WS(rs, 1)]));
Chris@10 355 ST(&(x[WS(rs, 1)]), VADD(Ts, Tt), ms, &(x[WS(rs, 1)]));
/* Remaining outputs: apply twiddles W[2],W[4] (conjugate pair) and
 * W[0],W[6] to each row's +/- butterfly halves, then store. */
Chris@10 356 {
Chris@10 357 V Tj, Tk, Ti, T1B, T1C, T1A;
Chris@10 358 Ti = VSUB(Ta, T7);
Chris@10 359 Tj = BYTWJ(&(W[TWVL * 2]), VADD(Th, Ti));
Chris@10 360 Tk = BYTWJ(&(W[TWVL * 4]), VSUB(Ti, Th));
Chris@10 361 ST(&(x[WS(vs, 2)]), Tj, ms, &(x[WS(vs, 2)]));
Chris@10 362 ST(&(x[WS(vs, 3)]), Tk, ms, &(x[WS(vs, 3)]));
Chris@10 363 T1A = VSUB(T1s, T1p);
Chris@10 364 T1B = BYTWJ(&(W[TWVL * 2]), VADD(T1z, T1A));
Chris@10 365 T1C = BYTWJ(&(W[TWVL * 4]), VSUB(T1A, T1z));
Chris@10 366 ST(&(x[WS(vs, 2) + WS(rs, 4)]), T1B, ms, &(x[WS(vs, 2)]));
Chris@10 367 ST(&(x[WS(vs, 3) + WS(rs, 4)]), T1C, ms, &(x[WS(vs, 3)]));
Chris@10 368 }
Chris@10 369 {
Chris@10 370 V T1h, T1i, T1g, TD, TE, TC;
Chris@10 371 T1g = VSUB(T18, T15);
Chris@10 372 T1h = BYTWJ(&(W[TWVL * 2]), VADD(T1f, T1g));
Chris@10 373 T1i = BYTWJ(&(W[TWVL * 4]), VSUB(T1g, T1f));
Chris@10 374 ST(&(x[WS(vs, 2) + WS(rs, 3)]), T1h, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 375 ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1i, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 376 TC = VSUB(Tu, Tr);
Chris@10 377 TD = BYTWJ(&(W[TWVL * 2]), VADD(TB, TC));
Chris@10 378 TE = BYTWJ(&(W[TWVL * 4]), VSUB(TC, TB));
Chris@10 379 ST(&(x[WS(vs, 2) + WS(rs, 1)]), TD, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 380 ST(&(x[WS(vs, 3) + WS(rs, 1)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 381 }
Chris@10 382 {
Chris@10 383 V TX, TY, TW, TT, TU, TP;
Chris@10 384 TW = VSUB(TO, TL);
Chris@10 385 TX = BYTWJ(&(W[TWVL * 2]), VADD(TV, TW));
Chris@10 386 TY = BYTWJ(&(W[TWVL * 4]), VSUB(TW, TV));
Chris@10 387 ST(&(x[WS(vs, 2) + WS(rs, 2)]), TX, ms, &(x[WS(vs, 2)]));
Chris@10 388 ST(&(x[WS(vs, 3) + WS(rs, 2)]), TY, ms, &(x[WS(vs, 3)]));
Chris@10 389 TP = VADD(TL, TO);
Chris@10 390 TT = BYTWJ(&(W[0]), VSUB(TP, TS));
Chris@10 391 TU = BYTWJ(&(W[TWVL * 6]), VADD(TS, TP));
Chris@10 392 ST(&(x[WS(vs, 1) + WS(rs, 2)]), TT, ms, &(x[WS(vs, 1)]));
Chris@10 393 ST(&(x[WS(vs, 4) + WS(rs, 2)]), TU, ms, &(x[WS(vs, 4)]));
Chris@10 394 }
Chris@10 395 {
Chris@10 396 V Tf, Tg, Tb, Tz, TA, Tv;
Chris@10 397 Tb = VADD(T7, Ta);
Chris@10 398 Tf = BYTWJ(&(W[0]), VSUB(Tb, Te));
Chris@10 399 Tg = BYTWJ(&(W[TWVL * 6]), VADD(Te, Tb));
Chris@10 400 ST(&(x[WS(vs, 1)]), Tf, ms, &(x[WS(vs, 1)]));
Chris@10 401 ST(&(x[WS(vs, 4)]), Tg, ms, &(x[WS(vs, 4)]));
Chris@10 402 Tv = VADD(Tr, Tu);
Chris@10 403 Tz = BYTWJ(&(W[0]), VSUB(Tv, Ty));
Chris@10 404 TA = BYTWJ(&(W[TWVL * 6]), VADD(Ty, Tv));
Chris@10 405 ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tz, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 406 ST(&(x[WS(vs, 4) + WS(rs, 1)]), TA, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 407 }
Chris@10 408 {
Chris@10 409 V T1d, T1e, T19, T1x, T1y, T1t;
Chris@10 410 T19 = VADD(T15, T18);
Chris@10 411 T1d = BYTWJ(&(W[0]), VSUB(T19, T1c));
Chris@10 412 T1e = BYTWJ(&(W[TWVL * 6]), VADD(T1c, T19));
Chris@10 413 ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1d, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 414 ST(&(x[WS(vs, 4) + WS(rs, 3)]), T1e, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 415 T1t = VADD(T1p, T1s);
Chris@10 416 T1x = BYTWJ(&(W[0]), VSUB(T1t, T1w));
Chris@10 417 T1y = BYTWJ(&(W[TWVL * 6]), VADD(T1w, T1t));
Chris@10 418 ST(&(x[WS(vs, 1) + WS(rs, 4)]), T1x, ms, &(x[WS(vs, 1)]));
Chris@10 419 ST(&(x[WS(vs, 4) + WS(rs, 4)]), T1y, ms, &(x[WS(vs, 4)]));
Chris@10 420 }
Chris@10 421 }
Chris@10 422 }
Chris@10 423 VLEAVE();
Chris@10 424 }
Chris@10 425
/* Twiddle-table layout descriptor (identical to the FMA branch): one
 * full complex twiddle per output index 1..4, then advance by VL. */
Chris@10 426 static const tw_instr twinstr[] = {
Chris@10 427 VTW(0, 1),
Chris@10 428 VTW(0, 2),
Chris@10 429 VTW(0, 3),
Chris@10 430 VTW(0, 4),
Chris@10 431 {TW_NEXT, VL, 0}
Chris@10 432 };
Chris@10 433
/* Codelet descriptor for the non-FMA variant: radix 5; op counts
 * {85 adds, 55 muls, 15 fmas, 0 other} match the generation comment. */
Chris@10 434 static const ct_desc desc = { 5, XSIMD_STRING("q1fv_5"), twinstr, &GENUS, {85, 55, 15, 0}, 0, 0, 0 };
Chris@10 435
/* Register the non-FMA codelet with the FFTW planner. */
Chris@10 436 void XSIMD(codelet_q1fv_5) (planner *p) {
Chris@10 437 X(kdft_difsq_register) (p, q1fv_5, &desc);
Chris@10 438 }
Chris@10 439 #endif /* HAVE_FMA */