annotate src/fftw-3.3.5/rdft/simd/common/hc2cbdftv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:52:46 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dif -sign 1 -name hc2cbdftv_20 -include hc2cbv.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 143 FP additions, 108 FP multiplications,
Chris@42 32 * (or, 77 additions, 42 multiplications, 66 fused multiply/add),
Chris@42 33 * 134 stack variables, 4 constants, and 40 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cbv.h"
Chris@42 36
Chris@42 37 static void hc2cbdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 40 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 41 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 {
Chris@42 44 INT m;
Chris@42 45 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@42 46 V T1M, T1T, T4, TF, T12, Te, T16, Ts, Tb, TN, TA, TG, TU, T1Y, T11;
Chris@42 47 V T1e, T29, T21, T15, Th, T13, Tp;
Chris@42 48 {
Chris@42 49 V TS, TT, Tf, T10, T20, T1Z, TX, Tg, Tn, To, T2, T3, TD, TE, T8;
Chris@42 50 V TV, T7, TZ, Tz, T9, Tu, Tv, T5, T6, Tx, Ty, Tc, Td, Tq, Tr;
Chris@42 51 V TY, Ta, TW, Tw;
Chris@42 52 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@42 53 T3 = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 54 TD = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 55 TE = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@42 56 T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@42 57 T6 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 58 Tx = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 59 Ty = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
Chris@42 60 T8 = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
Chris@42 61 TS = VFMACONJ(T3, T2);
Chris@42 62 T4 = VFNMSCONJ(T3, T2);
Chris@42 63 TT = VFMACONJ(TE, TD);
Chris@42 64 TF = VFNMSCONJ(TE, TD);
Chris@42 65 TV = VFMACONJ(T6, T5);
Chris@42 66 T7 = VFNMSCONJ(T6, T5);
Chris@42 67 TZ = VFMACONJ(Ty, Tx);
Chris@42 68 Tz = VFNMSCONJ(Ty, Tx);
Chris@42 69 T9 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 70 Tu = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 71 Tv = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@42 72 Tc = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
Chris@42 73 Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 74 Tq = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 75 Tr = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@42 76 Tf = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@42 77 TY = VFMACONJ(T9, T8);
Chris@42 78 Ta = VFMSCONJ(T9, T8);
Chris@42 79 TW = VFMACONJ(Tv, Tu);
Chris@42 80 Tw = VFNMSCONJ(Tv, Tu);
Chris@42 81 T12 = VFMACONJ(Td, Tc);
Chris@42 82 Te = VFNMSCONJ(Td, Tc);
Chris@42 83 T16 = VFMACONJ(Tr, Tq);
Chris@42 84 Ts = VFMSCONJ(Tr, Tq);
Chris@42 85 T10 = VSUB(TY, TZ);
Chris@42 86 T20 = VADD(TY, TZ);
Chris@42 87 Tb = VADD(T7, Ta);
Chris@42 88 TN = VSUB(T7, Ta);
Chris@42 89 T1Z = VADD(TV, TW);
Chris@42 90 TX = VSUB(TV, TW);
Chris@42 91 TA = VSUB(Tw, Tz);
Chris@42 92 TG = VADD(Tw, Tz);
Chris@42 93 Tg = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 94 Tn = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 95 To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
Chris@42 96 TU = VSUB(TS, TT);
Chris@42 97 T1Y = VADD(TS, TT);
Chris@42 98 T11 = VADD(TX, T10);
Chris@42 99 T1e = VSUB(TX, T10);
Chris@42 100 T29 = VSUB(T1Z, T20);
Chris@42 101 T21 = VADD(T1Z, T20);
Chris@42 102 T15 = VFMACONJ(Tg, Tf);
Chris@42 103 Th = VFMSCONJ(Tg, Tf);
Chris@42 104 T13 = VFMACONJ(To, Tn);
Chris@42 105 Tp = VFMSCONJ(To, Tn);
Chris@42 106 }
Chris@42 107 {
Chris@42 108 V T1S, T2B, T1W, T1I, T2q, T2w, T2i, T2c, T1C, T1K, T1s, T1g, T1, T2t, T1v;
Chris@42 109 V T1Q, T2A, T1q, T2m, TC, T1w, TP, T1x, T2f, T2r, T2g, T1E, T1D, T2y, T2x;
Chris@42 110 V T1i, T1h, T2D, T2C, T2s, T1t, T1u, T1y, T2u, TQ, T2d, T2e, T1U, T1L, T2j;
Chris@42 111 V T2k;
Chris@42 112 {
Chris@42 113 V T1R, T1F, T1V, T1o, TO, Tl, T1d, T2a, T1l, TB, TK, T1G, Tk, T1b, T19;
Chris@42 114 V T27, T25, T1H, TJ, T17, T23, TM, Ti, T14, T22, Tt, TH, Tj, T18, T24;
Chris@42 115 V TI, T2b, T2p, T1X, T2v, T2h, T2n, T1B, T1f, T28, T2o, T1a, TR, T1J, T1r;
Chris@42 116 V T1z, T26, Tm, TL, T1O, T1m, T1j, T2z, T1N, T1p, T1P, T2l, T1c, T1A, T1n;
Chris@42 117 V T1k;
Chris@42 118 T1R = LDW(&(W[TWVL * 18]));
Chris@42 119 T17 = VSUB(T15, T16);
Chris@42 120 T23 = VADD(T15, T16);
Chris@42 121 TM = VSUB(Te, Th);
Chris@42 122 Ti = VADD(Te, Th);
Chris@42 123 T14 = VSUB(T12, T13);
Chris@42 124 T22 = VADD(T12, T13);
Chris@42 125 Tt = VSUB(Tp, Ts);
Chris@42 126 TH = VADD(Tp, Ts);
Chris@42 127 T1F = LDW(&(W[TWVL * 28]));
Chris@42 128 T1V = LDW(&(W[TWVL * 8]));
Chris@42 129 T1o = VFMA(LDK(KP618033988), TM, TN);
Chris@42 130 TO = VFNMS(LDK(KP618033988), TN, TM);
Chris@42 131 Tj = VADD(Tb, Ti);
Chris@42 132 Tl = VSUB(Tb, Ti);
Chris@42 133 T18 = VADD(T14, T17);
Chris@42 134 T1d = VSUB(T14, T17);
Chris@42 135 T24 = VADD(T22, T23);
Chris@42 136 T2a = VSUB(T22, T23);
Chris@42 137 T1l = VFMA(LDK(KP618033988), Tt, TA);
Chris@42 138 TB = VFNMS(LDK(KP618033988), TA, Tt);
Chris@42 139 TI = VADD(TG, TH);
Chris@42 140 TK = VSUB(TG, TH);
Chris@42 141 T1G = VADD(T4, Tj);
Chris@42 142 Tk = VFNMS(LDK(KP250000000), Tj, T4);
Chris@42 143 T1b = VSUB(T11, T18);
Chris@42 144 T19 = VADD(T11, T18);
Chris@42 145 T27 = VSUB(T21, T24);
Chris@42 146 T25 = VADD(T21, T24);
Chris@42 147 T1H = VADD(TF, TI);
Chris@42 148 TJ = VFNMS(LDK(KP250000000), TI, TF);
Chris@42 149 T2b = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2a, T29));
Chris@42 150 T2p = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T29, T2a));
Chris@42 151 T1X = LDW(&(W[TWVL * 6]));
Chris@42 152 T1S = VZMUL(T1R, VADD(TU, T19));
Chris@42 153 T2v = LDW(&(W[TWVL * 22]));
Chris@42 154 T2B = VADD(T1Y, T25);
Chris@42 155 T26 = VFNMS(LDK(KP250000000), T25, T1Y);
Chris@42 156 T1W = VZMULI(T1V, VFMAI(T1H, T1G));
Chris@42 157 T1I = VZMULI(T1F, VFNMSI(T1H, T1G));
Chris@42 158 T2h = LDW(&(W[TWVL * 30]));
Chris@42 159 T2n = LDW(&(W[TWVL * 14]));
Chris@42 160 T1B = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1d, T1e));
Chris@42 161 T1f = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1e, T1d));
Chris@42 162 T28 = VFMA(LDK(KP559016994), T27, T26);
Chris@42 163 T2o = VFNMS(LDK(KP559016994), T27, T26);
Chris@42 164 T1a = VFNMS(LDK(KP250000000), T19, TU);
Chris@42 165 TR = LDW(&(W[TWVL * 2]));
Chris@42 166 T1J = LDW(&(W[TWVL * 26]));
Chris@42 167 T1r = LDW(&(W[TWVL * 34]));
Chris@42 168 T1z = LDW(&(W[TWVL * 10]));
Chris@42 169 T1k = VFMA(LDK(KP559016994), Tl, Tk);
Chris@42 170 Tm = VFNMS(LDK(KP559016994), Tl, Tk);
Chris@42 171 T2q = VZMUL(T2n, VFMAI(T2p, T2o));
Chris@42 172 T2w = VZMUL(T2v, VFNMSI(T2p, T2o));
Chris@42 173 T2i = VZMUL(T2h, VFMAI(T2b, T28));
Chris@42 174 T2c = VZMUL(T1X, VFNMSI(T2b, T28));
Chris@42 175 T1c = VFNMS(LDK(KP559016994), T1b, T1a);
Chris@42 176 T1A = VFMA(LDK(KP559016994), T1b, T1a);
Chris@42 177 TL = VFNMS(LDK(KP559016994), TK, TJ);
Chris@42 178 T1n = VFMA(LDK(KP559016994), TK, TJ);
Chris@42 179 T1O = VFMA(LDK(KP951056516), T1l, T1k);
Chris@42 180 T1m = VFNMS(LDK(KP951056516), T1l, T1k);
Chris@42 181 T1j = LDW(&(W[TWVL * 36]));
Chris@42 182 T2z = LDW(&(W[0]));
Chris@42 183 T1N = LDW(&(W[TWVL * 20]));
Chris@42 184 T1C = VZMUL(T1z, VFMAI(T1B, T1A));
Chris@42 185 T1K = VZMUL(T1J, VFNMSI(T1B, T1A));
Chris@42 186 T1s = VZMUL(T1r, VFMAI(T1f, T1c));
Chris@42 187 T1g = VZMUL(TR, VFNMSI(T1f, T1c));
Chris@42 188 T1p = VFMA(LDK(KP951056516), T1o, T1n);
Chris@42 189 T1P = VFNMS(LDK(KP951056516), T1o, T1n);
Chris@42 190 T2l = LDW(&(W[TWVL * 16]));
Chris@42 191 T1 = LDW(&(W[TWVL * 4]));
Chris@42 192 T2t = LDW(&(W[TWVL * 24]));
Chris@42 193 T1v = LDW(&(W[TWVL * 12]));
Chris@42 194 T1Q = VZMULI(T1N, VFNMSI(T1P, T1O));
Chris@42 195 T2A = VZMULI(T2z, VFMAI(T1p, T1m));
Chris@42 196 T1q = VZMULI(T1j, VFNMSI(T1p, T1m));
Chris@42 197 T2m = VZMULI(T2l, VFMAI(T1P, T1O));
Chris@42 198 TC = VFMA(LDK(KP951056516), TB, Tm);
Chris@42 199 T1w = VFNMS(LDK(KP951056516), TB, Tm);
Chris@42 200 TP = VFNMS(LDK(KP951056516), TO, TL);
Chris@42 201 T1x = VFMA(LDK(KP951056516), TO, TL);
Chris@42 202 T2f = LDW(&(W[TWVL * 32]));
Chris@42 203 }
Chris@42 204 T2D = VCONJ(VSUB(T2B, T2A));
Chris@42 205 T2C = VADD(T2A, T2B);
Chris@42 206 T2s = VCONJ(VSUB(T2q, T2m));
Chris@42 207 T2r = VADD(T2m, T2q);
Chris@42 208 T1t = VADD(T1q, T1s);
Chris@42 209 T1u = VCONJ(VSUB(T1s, T1q));
Chris@42 210 T1y = VZMULI(T1v, VFNMSI(T1x, T1w));
Chris@42 211 T2u = VZMULI(T2t, VFMAI(T1x, T1w));
Chris@42 212 TQ = VZMULI(T1, VFNMSI(TP, TC));
Chris@42 213 T2g = VZMULI(T2f, VFMAI(TP, TC));
Chris@42 214 ST(&(Rm[0]), T2D, -ms, &(Rm[0]));
Chris@42 215 ST(&(Rp[0]), T2C, ms, &(Rp[0]));
Chris@42 216 ST(&(Rm[WS(rs, 4)]), T2s, -ms, &(Rm[0]));
Chris@42 217 ST(&(Rm[WS(rs, 9)]), T1u, -ms, &(Rm[WS(rs, 1)]));
Chris@42 218 T1E = VCONJ(VSUB(T1C, T1y));
Chris@42 219 T1D = VADD(T1y, T1C);
Chris@42 220 T2y = VCONJ(VSUB(T2w, T2u));
Chris@42 221 T2x = VADD(T2u, T2w);
Chris@42 222 T1i = VCONJ(VSUB(T1g, TQ));
Chris@42 223 T1h = VADD(TQ, T1g);
Chris@42 224 ST(&(Rp[WS(rs, 9)]), T1t, ms, &(Rp[WS(rs, 1)]));
Chris@42 225 T1L = VADD(T1I, T1K);
Chris@42 226 T1M = VCONJ(VSUB(T1K, T1I));
Chris@42 227 ST(&(Rp[WS(rs, 3)]), T1D, ms, &(Rp[WS(rs, 1)]));
Chris@42 228 ST(&(Rm[WS(rs, 6)]), T2y, -ms, &(Rm[0]));
Chris@42 229 ST(&(Rp[WS(rs, 6)]), T2x, ms, &(Rp[0]));
Chris@42 230 ST(&(Rm[WS(rs, 1)]), T1i, -ms, &(Rm[WS(rs, 1)]));
Chris@42 231 ST(&(Rp[WS(rs, 1)]), T1h, ms, &(Rp[WS(rs, 1)]));
Chris@42 232 T2d = VADD(T1W, T2c);
Chris@42 233 T2e = VCONJ(VSUB(T2c, T1W));
Chris@42 234 ST(&(Rm[WS(rs, 3)]), T1E, -ms, &(Rm[WS(rs, 1)]));
Chris@42 235 ST(&(Rp[WS(rs, 7)]), T1L, ms, &(Rp[WS(rs, 1)]));
Chris@42 236 T1U = VCONJ(VSUB(T1S, T1Q));
Chris@42 237 T1T = VADD(T1Q, T1S);
Chris@42 238 T2j = VADD(T2g, T2i);
Chris@42 239 T2k = VCONJ(VSUB(T2i, T2g));
Chris@42 240 ST(&(Rp[WS(rs, 2)]), T2d, ms, &(Rp[0]));
Chris@42 241 ST(&(Rp[WS(rs, 4)]), T2r, ms, &(Rp[0]));
Chris@42 242 ST(&(Rm[WS(rs, 5)]), T1U, -ms, &(Rm[WS(rs, 1)]));
Chris@42 243 ST(&(Rm[WS(rs, 2)]), T2e, -ms, &(Rm[0]));
Chris@42 244 ST(&(Rp[WS(rs, 8)]), T2j, ms, &(Rp[0]));
Chris@42 245 ST(&(Rm[WS(rs, 8)]), T2k, -ms, &(Rm[0]));
Chris@42 246 }
Chris@42 247 ST(&(Rp[WS(rs, 5)]), T1T, ms, &(Rp[WS(rs, 1)]));
Chris@42 248 ST(&(Rm[WS(rs, 7)]), T1M, -ms, &(Rm[WS(rs, 1)]));
Chris@42 249 }
Chris@42 250 }
Chris@42 251 VLEAVE();
Chris@42 252 }
Chris@42 253
Chris@42 254 static const tw_instr twinstr[] = {
Chris@42 255 VTW(1, 1),
Chris@42 256 VTW(1, 2),
Chris@42 257 VTW(1, 3),
Chris@42 258 VTW(1, 4),
Chris@42 259 VTW(1, 5),
Chris@42 260 VTW(1, 6),
Chris@42 261 VTW(1, 7),
Chris@42 262 VTW(1, 8),
Chris@42 263 VTW(1, 9),
Chris@42 264 VTW(1, 10),
Chris@42 265 VTW(1, 11),
Chris@42 266 VTW(1, 12),
Chris@42 267 VTW(1, 13),
Chris@42 268 VTW(1, 14),
Chris@42 269 VTW(1, 15),
Chris@42 270 VTW(1, 16),
Chris@42 271 VTW(1, 17),
Chris@42 272 VTW(1, 18),
Chris@42 273 VTW(1, 19),
Chris@42 274 {TW_NEXT, VL, 0}
Chris@42 275 };
Chris@42 276
Chris@42 277 static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cbdftv_20"), twinstr, &GENUS, {77, 42, 66, 0} };
Chris@42 278
Chris@42 279 void XSIMD(codelet_hc2cbdftv_20) (planner *p) {
Chris@42 280 X(khc2c_register) (p, hc2cbdftv_20, &desc, HC2C_VIA_DFT);
Chris@42 281 }
Chris@42 282 #else /* HAVE_FMA */
Chris@42 283
Chris@42 284 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dif -sign 1 -name hc2cbdftv_20 -include hc2cbv.h */
Chris@42 285
Chris@42 286 /*
Chris@42 287 * This function contains 143 FP additions, 62 FP multiplications,
Chris@42 288 * (or, 131 additions, 50 multiplications, 12 fused multiply/add),
Chris@42 289 * 114 stack variables, 4 constants, and 40 memory accesses
Chris@42 290 */
Chris@42 291 #include "hc2cbv.h"
Chris@42 292
Chris@42 293 static void hc2cbdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 294 {
Chris@42 295 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 296 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 297 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 298 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 299 {
Chris@42 300 INT m;
Chris@42 301 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
Chris@42 302 V TK, T1v, TY, T1x, T1j, T2f, TS, TT, TO, TU, T5, To, Tp, Tq, T2a;
Chris@42 303 V T2d, T2g, T2k, T2j, T1k, T1l, T18, T1m, T1f;
Chris@42 304 {
Chris@42 305 V T2, TP, T4, TR, TI, T1d, T9, T12, Td, T15, TE, T1a, Tv, T13, Tm;
Chris@42 306 V T1c, Tz, T16, Ti, T19, T3, TQ, TH, TG, TF, T6, T8, T7, Tc, Tb;
Chris@42 307 V Ta, TD, TC, TB, Ts, Tu, Tt, Tl, Tk, Tj, Tw, Ty, Tx, Tf, Th;
Chris@42 308 V Tg, TA, TJ, TW, TX, T1h, T1i, TM, TN, Te, Tn, T28, T29, T2b, T2c;
Chris@42 309 V T14, T17, T1b, T1e;
Chris@42 310 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@42 311 TP = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 312 T3 = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 313 T4 = VCONJ(T3);
Chris@42 314 TQ = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@42 315 TR = VCONJ(TQ);
Chris@42 316 TH = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 317 TF = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@42 318 TG = VCONJ(TF);
Chris@42 319 TI = VSUB(TG, TH);
Chris@42 320 T1d = VADD(TG, TH);
Chris@42 321 T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@42 322 T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 323 T8 = VCONJ(T7);
Chris@42 324 T9 = VSUB(T6, T8);
Chris@42 325 T12 = VADD(T6, T8);
Chris@42 326 Tc = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
Chris@42 327 Ta = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 328 Tb = VCONJ(Ta);
Chris@42 329 Td = VSUB(Tb, Tc);
Chris@42 330 T15 = VADD(Tb, Tc);
Chris@42 331 TD = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 332 TB = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
Chris@42 333 TC = VCONJ(TB);
Chris@42 334 TE = VSUB(TC, TD);
Chris@42 335 T1a = VADD(TC, TD);
Chris@42 336 Ts = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 337 Tt = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@42 338 Tu = VCONJ(Tt);
Chris@42 339 Tv = VSUB(Ts, Tu);
Chris@42 340 T13 = VADD(Ts, Tu);
Chris@42 341 Tl = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@42 342 Tj = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 343 Tk = VCONJ(Tj);
Chris@42 344 Tm = VSUB(Tk, Tl);
Chris@42 345 T1c = VADD(Tk, Tl);
Chris@42 346 Tw = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 347 Tx = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
Chris@42 348 Ty = VCONJ(Tx);
Chris@42 349 Tz = VSUB(Tw, Ty);
Chris@42 350 T16 = VADD(Tw, Ty);
Chris@42 351 Tf = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
Chris@42 352 Tg = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 353 Th = VCONJ(Tg);
Chris@42 354 Ti = VSUB(Tf, Th);
Chris@42 355 T19 = VADD(Tf, Th);
Chris@42 356 TA = VSUB(Tv, Tz);
Chris@42 357 TJ = VSUB(TE, TI);
Chris@42 358 TK = VFNMS(LDK(KP951056516), TJ, VMUL(LDK(KP587785252), TA));
Chris@42 359 T1v = VFMA(LDK(KP951056516), TA, VMUL(LDK(KP587785252), TJ));
Chris@42 360 TW = VSUB(T9, Td);
Chris@42 361 TX = VSUB(Ti, Tm);
Chris@42 362 TY = VFNMS(LDK(KP951056516), TX, VMUL(LDK(KP587785252), TW));
Chris@42 363 T1x = VFMA(LDK(KP951056516), TW, VMUL(LDK(KP587785252), TX));
Chris@42 364 T1h = VADD(T2, T4);
Chris@42 365 T1i = VADD(TP, TR);
Chris@42 366 T1j = VSUB(T1h, T1i);
Chris@42 367 T2f = VADD(T1h, T1i);
Chris@42 368 TS = VSUB(TP, TR);
Chris@42 369 TM = VADD(Tv, Tz);
Chris@42 370 TN = VADD(TE, TI);
Chris@42 371 TT = VADD(TM, TN);
Chris@42 372 TO = VMUL(LDK(KP559016994), VSUB(TM, TN));
Chris@42 373 TU = VFNMS(LDK(KP250000000), TT, TS);
Chris@42 374 T5 = VSUB(T2, T4);
Chris@42 375 Te = VADD(T9, Td);
Chris@42 376 Tn = VADD(Ti, Tm);
Chris@42 377 To = VADD(Te, Tn);
Chris@42 378 Tp = VFNMS(LDK(KP250000000), To, T5);
Chris@42 379 Tq = VMUL(LDK(KP559016994), VSUB(Te, Tn));
Chris@42 380 T28 = VADD(T12, T13);
Chris@42 381 T29 = VADD(T15, T16);
Chris@42 382 T2a = VADD(T28, T29);
Chris@42 383 T2b = VADD(T19, T1a);
Chris@42 384 T2c = VADD(T1c, T1d);
Chris@42 385 T2d = VADD(T2b, T2c);
Chris@42 386 T2g = VADD(T2a, T2d);
Chris@42 387 T2k = VSUB(T2b, T2c);
Chris@42 388 T2j = VSUB(T28, T29);
Chris@42 389 T14 = VSUB(T12, T13);
Chris@42 390 T17 = VSUB(T15, T16);
Chris@42 391 T1k = VADD(T14, T17);
Chris@42 392 T1b = VSUB(T19, T1a);
Chris@42 393 T1e = VSUB(T1c, T1d);
Chris@42 394 T1l = VADD(T1b, T1e);
Chris@42 395 T18 = VSUB(T14, T17);
Chris@42 396 T1m = VADD(T1k, T1l);
Chris@42 397 T1f = VSUB(T1b, T1e);
Chris@42 398 }
Chris@42 399 {
Chris@42 400 V T2L, T22, T1S, T26, T2m, T2G, T2s, T2A, T1q, T1U, T1C, T1M, T10, T2E, T1I;
Chris@42 401 V T2q, T1A, T2K, T20, T2w, T21, T1Q, T1R, T1P, T25, T1r, T1s, T2C, T2N, T1N;
Chris@42 402 V T2H, T2I, T2M, T1E, T1D, T1O, T1V, T2n, T2B, T24, T2o, T2t, T2u, T23, T1W;
Chris@42 403 T2L = VADD(T2f, T2g);
Chris@42 404 T21 = LDW(&(W[TWVL * 18]));
Chris@42 405 T22 = VZMUL(T21, VADD(T1j, T1m));
Chris@42 406 T1Q = VADD(T5, To);
Chris@42 407 T1R = VBYI(VADD(TS, TT));
Chris@42 408 T1P = LDW(&(W[TWVL * 28]));
Chris@42 409 T1S = VZMULI(T1P, VSUB(T1Q, T1R));
Chris@42 410 T25 = LDW(&(W[TWVL * 8]));
Chris@42 411 T26 = VZMULI(T25, VADD(T1Q, T1R));
Chris@42 412 {
Chris@42 413 V T2l, T2z, T2i, T2y, T2e, T2h, T27, T2F, T2r, T2x, T1g, T1K, T1p, T1L, T1n;
Chris@42 414 V T1o, T11, T1T, T1B, T1J, TL, T1G, TZ, T1H, Tr, TV, T1, T2D, T1F, T2p;
Chris@42 415 V T1w, T1Y, T1z, T1Z, T1u, T1y, T1t, T2J, T1X, T2v;
Chris@42 416 T2l = VBYI(VFMA(LDK(KP951056516), T2j, VMUL(LDK(KP587785252), T2k)));
Chris@42 417 T2z = VBYI(VFNMS(LDK(KP951056516), T2k, VMUL(LDK(KP587785252), T2j)));
Chris@42 418 T2e = VMUL(LDK(KP559016994), VSUB(T2a, T2d));
Chris@42 419 T2h = VFNMS(LDK(KP250000000), T2g, T2f);
Chris@42 420 T2i = VADD(T2e, T2h);
Chris@42 421 T2y = VSUB(T2h, T2e);
Chris@42 422 T27 = LDW(&(W[TWVL * 6]));
Chris@42 423 T2m = VZMUL(T27, VSUB(T2i, T2l));
Chris@42 424 T2F = LDW(&(W[TWVL * 22]));
Chris@42 425 T2G = VZMUL(T2F, VADD(T2z, T2y));
Chris@42 426 T2r = LDW(&(W[TWVL * 30]));
Chris@42 427 T2s = VZMUL(T2r, VADD(T2l, T2i));
Chris@42 428 T2x = LDW(&(W[TWVL * 14]));
Chris@42 429 T2A = VZMUL(T2x, VSUB(T2y, T2z));
Chris@42 430 T1g = VBYI(VFNMS(LDK(KP951056516), T1f, VMUL(LDK(KP587785252), T18)));
Chris@42 431 T1K = VBYI(VFMA(LDK(KP951056516), T18, VMUL(LDK(KP587785252), T1f)));
Chris@42 432 T1n = VFNMS(LDK(KP250000000), T1m, T1j);
Chris@42 433 T1o = VMUL(LDK(KP559016994), VSUB(T1k, T1l));
Chris@42 434 T1p = VSUB(T1n, T1o);
Chris@42 435 T1L = VADD(T1o, T1n);
Chris@42 436 T11 = LDW(&(W[TWVL * 2]));
Chris@42 437 T1q = VZMUL(T11, VADD(T1g, T1p));
Chris@42 438 T1T = LDW(&(W[TWVL * 26]));
Chris@42 439 T1U = VZMUL(T1T, VSUB(T1L, T1K));
Chris@42 440 T1B = LDW(&(W[TWVL * 34]));
Chris@42 441 T1C = VZMUL(T1B, VSUB(T1p, T1g));
Chris@42 442 T1J = LDW(&(W[TWVL * 10]));
Chris@42 443 T1M = VZMUL(T1J, VADD(T1K, T1L));
Chris@42 444 Tr = VSUB(Tp, Tq);
Chris@42 445 TL = VSUB(Tr, TK);
Chris@42 446 T1G = VADD(Tr, TK);
Chris@42 447 TV = VSUB(TO, TU);
Chris@42 448 TZ = VBYI(VSUB(TV, TY));
Chris@42 449 T1H = VBYI(VADD(TY, TV));
Chris@42 450 T1 = LDW(&(W[TWVL * 4]));
Chris@42 451 T10 = VZMULI(T1, VADD(TL, TZ));
Chris@42 452 T2D = LDW(&(W[TWVL * 24]));
Chris@42 453 T2E = VZMULI(T2D, VSUB(T1G, T1H));
Chris@42 454 T1F = LDW(&(W[TWVL * 12]));
Chris@42 455 T1I = VZMULI(T1F, VADD(T1G, T1H));
Chris@42 456 T2p = LDW(&(W[TWVL * 32]));
Chris@42 457 T2q = VZMULI(T2p, VSUB(TL, TZ));
Chris@42 458 T1u = VADD(Tq, Tp);
Chris@42 459 T1w = VSUB(T1u, T1v);
Chris@42 460 T1Y = VADD(T1u, T1v);
Chris@42 461 T1y = VADD(TO, TU);
Chris@42 462 T1z = VBYI(VADD(T1x, T1y));
Chris@42 463 T1Z = VBYI(VSUB(T1y, T1x));
Chris@42 464 T1t = LDW(&(W[TWVL * 36]));
Chris@42 465 T1A = VZMULI(T1t, VSUB(T1w, T1z));
Chris@42 466 T2J = LDW(&(W[0]));
Chris@42 467 T2K = VZMULI(T2J, VADD(T1w, T1z));
Chris@42 468 T1X = LDW(&(W[TWVL * 20]));
Chris@42 469 T20 = VZMULI(T1X, VSUB(T1Y, T1Z));
Chris@42 470 T2v = LDW(&(W[TWVL * 16]));
Chris@42 471 T2w = VZMULI(T2v, VADD(T1Y, T1Z));
Chris@42 472 }
Chris@42 473 T1r = VADD(T10, T1q);
Chris@42 474 ST(&(Rp[WS(rs, 1)]), T1r, ms, &(Rp[WS(rs, 1)]));
Chris@42 475 T1s = VCONJ(VSUB(T1q, T10));
Chris@42 476 ST(&(Rm[WS(rs, 1)]), T1s, -ms, &(Rm[WS(rs, 1)]));
Chris@42 477 T2C = VCONJ(VSUB(T2A, T2w));
Chris@42 478 ST(&(Rm[WS(rs, 4)]), T2C, -ms, &(Rm[0]));
Chris@42 479 T2N = VCONJ(VSUB(T2L, T2K));
Chris@42 480 ST(&(Rm[0]), T2N, -ms, &(Rm[0]));
Chris@42 481 T1N = VADD(T1I, T1M);
Chris@42 482 ST(&(Rp[WS(rs, 3)]), T1N, ms, &(Rp[WS(rs, 1)]));
Chris@42 483 T2H = VADD(T2E, T2G);
Chris@42 484 ST(&(Rp[WS(rs, 6)]), T2H, ms, &(Rp[0]));
Chris@42 485 T2I = VCONJ(VSUB(T2G, T2E));
Chris@42 486 ST(&(Rm[WS(rs, 6)]), T2I, -ms, &(Rm[0]));
Chris@42 487 T2M = VADD(T2K, T2L);
Chris@42 488 ST(&(Rp[0]), T2M, ms, &(Rp[0]));
Chris@42 489 T1E = VCONJ(VSUB(T1C, T1A));
Chris@42 490 ST(&(Rm[WS(rs, 9)]), T1E, -ms, &(Rm[WS(rs, 1)]));
Chris@42 491 T1D = VADD(T1A, T1C);
Chris@42 492 ST(&(Rp[WS(rs, 9)]), T1D, ms, &(Rp[WS(rs, 1)]));
Chris@42 493 T1O = VCONJ(VSUB(T1M, T1I));
Chris@42 494 ST(&(Rm[WS(rs, 3)]), T1O, -ms, &(Rm[WS(rs, 1)]));
Chris@42 495 T1V = VADD(T1S, T1U);
Chris@42 496 ST(&(Rp[WS(rs, 7)]), T1V, ms, &(Rp[WS(rs, 1)]));
Chris@42 497 T2n = VADD(T26, T2m);
Chris@42 498 ST(&(Rp[WS(rs, 2)]), T2n, ms, &(Rp[0]));
Chris@42 499 T2B = VADD(T2w, T2A);
Chris@42 500 ST(&(Rp[WS(rs, 4)]), T2B, ms, &(Rp[0]));
Chris@42 501 T24 = VCONJ(VSUB(T22, T20));
Chris@42 502 ST(&(Rm[WS(rs, 5)]), T24, -ms, &(Rm[WS(rs, 1)]));
Chris@42 503 T2o = VCONJ(VSUB(T2m, T26));
Chris@42 504 ST(&(Rm[WS(rs, 2)]), T2o, -ms, &(Rm[0]));
Chris@42 505 T2t = VADD(T2q, T2s);
Chris@42 506 ST(&(Rp[WS(rs, 8)]), T2t, ms, &(Rp[0]));
Chris@42 507 T2u = VCONJ(VSUB(T2s, T2q));
Chris@42 508 ST(&(Rm[WS(rs, 8)]), T2u, -ms, &(Rm[0]));
Chris@42 509 T23 = VADD(T20, T22);
Chris@42 510 ST(&(Rp[WS(rs, 5)]), T23, ms, &(Rp[WS(rs, 1)]));
Chris@42 511 T1W = VCONJ(VSUB(T1U, T1S));
Chris@42 512 ST(&(Rm[WS(rs, 7)]), T1W, -ms, &(Rm[WS(rs, 1)]));
Chris@42 513 }
Chris@42 514 }
Chris@42 515 }
Chris@42 516 VLEAVE();
Chris@42 517 }
Chris@42 518
Chris@42 519 static const tw_instr twinstr[] = {
Chris@42 520 VTW(1, 1),
Chris@42 521 VTW(1, 2),
Chris@42 522 VTW(1, 3),
Chris@42 523 VTW(1, 4),
Chris@42 524 VTW(1, 5),
Chris@42 525 VTW(1, 6),
Chris@42 526 VTW(1, 7),
Chris@42 527 VTW(1, 8),
Chris@42 528 VTW(1, 9),
Chris@42 529 VTW(1, 10),
Chris@42 530 VTW(1, 11),
Chris@42 531 VTW(1, 12),
Chris@42 532 VTW(1, 13),
Chris@42 533 VTW(1, 14),
Chris@42 534 VTW(1, 15),
Chris@42 535 VTW(1, 16),
Chris@42 536 VTW(1, 17),
Chris@42 537 VTW(1, 18),
Chris@42 538 VTW(1, 19),
Chris@42 539 {TW_NEXT, VL, 0}
Chris@42 540 };
Chris@42 541
Chris@42 542 static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cbdftv_20"), twinstr, &GENUS, {131, 50, 12, 0} };
Chris@42 543
Chris@42 544 void XSIMD(codelet_hc2cbdftv_20) (planner *p) {
Chris@42 545 X(khc2c_register) (p, hc2cbdftv_20, &desc, HC2C_VIA_DFT);
Chris@42 546 }
Chris@42 547 #endif /* HAVE_FMA */
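
Note (not part of the generated file): the DVK constants declared in the two variants above are the standard radix-5 trigonometric values that a size-20 decimation-in-frequency split uses, namely
\[
\mathrm{KP250000000}=\tfrac14,\quad
\mathrm{KP559016994}=\tfrac{\sqrt5}{4},\quad
\mathrm{KP618033988}=\tfrac{\sqrt5-1}{2}=2\cos\tfrac{2\pi}{5},\quad
\mathrm{KP951056516}=\sin\tfrac{2\pi}{5},\quad
\mathrm{KP587785252}=\sin\tfrac{\pi}{5}.
\]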