annotate src/fftw-3.3.8/rdft/simd/common/hc2cbdftv_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:08:12 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dif -sign 1 -name hc2cbdftv_20 -include rdft/simd/hc2cbv.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 143 FP additions, 108 FP multiplications,
Chris@82 32 * (or, 77 additions, 42 multiplications, 66 fused multiply/add),
Chris@82 33 * 110 stack variables, 4 constants, and 40 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/simd/hc2cbv.h"
Chris@82 36
/*
 * hc2cbdftv_20 (FMA branch): SIMD codelet emitted by genfft with
 * "-n 20 -dif -sign 1" (see the "Generated by" comment above).
 *
 * On every loop iteration the body loads ten vectors from Rp[WS(rs, 0..9)]
 * and ten from Rm[WS(rs, 0..9)], combines them, multiplies the results by
 * the nineteen twiddle vectors read from W, and stores ten vectors back to
 * Rp and ten back to Rm, i.e. the arrays are updated in place.
 *
 * Rp, Rm : input/output arrays, indexed through WS(rs, k). Rp advances by
 *          VL * ms per iteration, Rm moves backwards by VL * ms.
 * Ip, Im : stepped by the loop header but never dereferenced in this body.
 * W      : twiddle table; first offset by (mb - 1) * ((TWVL / VL) * 38),
 *          then advanced by TWVL * 38 per iteration.
 * rs     : stride handed to WS() and MAKE_VOLATILE_STRIDE().
 * mb, me : the loop runs m = mb, mb + VL, ... while m < me.
 * ms     : per-m stride; also passed (negated for Rm) to LD() and ST().
 */
Chris@82 37 static void hc2cbdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric constants; by value these are sqrt(5)/4, sin(2*pi/5),
 * (sqrt(5) - 1)/2 and 1/4. */
Chris@82 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 40 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 42 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 43 {
Chris@82 44 INT m;
/* One pass per group of VL values of m; all pointer bookkeeping is done in
 * the for-header itself. */
Chris@82 45 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
/* Values computed in the load phase that the twiddle phase still needs. */
Chris@82 46 V T4, TF, Tl, T2a, T1d, T1Y, T29, TK, TU, T1e, Tj, Tk, TI, TJ, T19;
Chris@82 47 V T1b, T25, T27, TB, T1l, TO, T1o;
Chris@82 48 {
Chris@82 49 V TS, TT, T7, Tz, Ta, Tw, Tb, TG, T20, T1Z, T10, TX, Te, Ts, Th;
Chris@82 50 V Tp, Ti, TH, T23, T22, T17, T14, T2, T3, TD, TE, TV, TZ, TY, TW;
Chris@82 51 V T5, T6, Tx, Ty, T8, T9, Tu, Tv, T12, T16, T15, T13, Tc, Td, Tq;
Chris@82 52 V Tr, Tf, Tg, Tn, To, T11, T18, T21, T24, Tt, TA, TM, TN;
/* Load phase: each Rp[WS(rs, k)] is paired with Rm[WS(rs, 9 - k)] and the
 * pair is merged by VFMACONJ / VFNMSCONJ / VFMSCONJ.
 * NOTE(review): these macros presumably form the sum / differences of the
 * Rp value and the conjugated Rm value, as the explicit VCONJ + VADD/VSUB
 * sequence in the #else branch of this file suggests; confirm against the
 * SIMD support headers. */
Chris@82 53 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@82 54 T3 = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 55 T4 = VFNMSCONJ(T3, T2);
Chris@82 56 TS = VFMACONJ(T3, T2);
Chris@82 57 TD = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 58 TE = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@82 59 TF = VFNMSCONJ(TE, TD);
Chris@82 60 TT = VFMACONJ(TE, TD);
Chris@82 61 T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@82 62 T6 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 63 T7 = VFNMSCONJ(T6, T5);
Chris@82 64 TV = VFMACONJ(T6, T5);
Chris@82 65 Tx = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 66 Ty = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
Chris@82 67 Tz = VFNMSCONJ(Ty, Tx);
Chris@82 68 TZ = VFMACONJ(Ty, Tx);
Chris@82 69 T8 = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
Chris@82 70 T9 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 71 Ta = VFMSCONJ(T9, T8);
Chris@82 72 TY = VFMACONJ(T9, T8);
Chris@82 73 Tu = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 74 Tv = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@82 75 Tw = VFNMSCONJ(Tv, Tu);
Chris@82 76 TW = VFMACONJ(Tv, Tu);
Chris@82 77 Tb = VADD(T7, Ta);
Chris@82 78 TG = VADD(Tw, Tz);
Chris@82 79 T20 = VADD(TY, TZ);
Chris@82 80 T1Z = VADD(TV, TW);
Chris@82 81 T10 = VSUB(TY, TZ);
Chris@82 82 TX = VSUB(TV, TW);
Chris@82 83 Tc = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
Chris@82 84 Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 85 Te = VFNMSCONJ(Td, Tc);
Chris@82 86 T12 = VFMACONJ(Td, Tc);
Chris@82 87 Tq = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 88 Tr = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@82 89 Ts = VFMSCONJ(Tr, Tq);
Chris@82 90 T16 = VFMACONJ(Tr, Tq);
Chris@82 91 Tf = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@82 92 Tg = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 93 Th = VFMSCONJ(Tg, Tf);
Chris@82 94 T15 = VFMACONJ(Tg, Tf);
Chris@82 95 Tn = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 96 To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
Chris@82 97 Tp = VFMSCONJ(To, Tn);
Chris@82 98 T13 = VFMACONJ(To, Tn);
Chris@82 99 Ti = VADD(Te, Th);
Chris@82 100 TH = VADD(Tp, Ts);
Chris@82 101 T23 = VADD(T15, T16);
Chris@82 102 T22 = VADD(T12, T13);
Chris@82 103 T17 = VSUB(T15, T16);
Chris@82 104 T14 = VSUB(T12, T13);
/* All twenty inputs are loaded; the rest of this scope only forms sums,
 * differences and constant-scaled combinations of them. */
Chris@82 105 Tl = VSUB(Tb, Ti);
Chris@82 106 T2a = VSUB(T22, T23);
Chris@82 107 T1d = VSUB(T14, T17);
Chris@82 108 T1Y = VADD(TS, TT);
Chris@82 109 T29 = VSUB(T1Z, T20);
Chris@82 110 TK = VSUB(TG, TH);
Chris@82 111 TU = VSUB(TS, TT);
Chris@82 112 T1e = VSUB(TX, T10);
Chris@82 113 Tj = VADD(Tb, Ti);
Chris@82 114 Tk = VFNMS(LDK(KP250000000), Tj, T4);
Chris@82 115 TI = VADD(TG, TH);
Chris@82 116 TJ = VFNMS(LDK(KP250000000), TI, TF);
Chris@82 117 T11 = VADD(TX, T10);
Chris@82 118 T18 = VADD(T14, T17);
Chris@82 119 T19 = VADD(T11, T18);
Chris@82 120 T1b = VSUB(T11, T18);
Chris@82 121 T21 = VADD(T1Z, T20);
Chris@82 122 T24 = VADD(T22, T23);
Chris@82 123 T25 = VADD(T21, T24);
Chris@82 124 T27 = VSUB(T21, T24);
Chris@82 125 Tt = VSUB(Tp, Ts);
Chris@82 126 TA = VSUB(Tw, Tz);
Chris@82 127 TB = VFNMS(LDK(KP618033988), TA, Tt);
Chris@82 128 T1l = VFMA(LDK(KP618033988), Tt, TA);
Chris@82 129 TM = VSUB(Te, Th);
Chris@82 130 TN = VSUB(T7, Ta);
Chris@82 131 TO = VFNMS(LDK(KP618033988), TN, TM);
Chris@82 132 T1o = VFMA(LDK(KP618033988), TM, TN);
Chris@82 133 }
Chris@82 134 {
Chris@82 135 V T2B, T1S, T1I, T1W, T2c, T2w, T2i, T2q, T1g, T1K, T1s, T1C, T1q, T2A, T1Q;
Chris@82 136 V T2m, TQ, T2u, T1y, T2g, T1R, T1G, T1H, T1F, T1V, T1h, T1i, T2s, T2D, T1D;
Chris@82 137 V T2x, T2y, T2C, T1u, T1t, T1E, T1L, T2d, T2r, T1U, T2e, T2j, T2k, T1T, T1M;
/* Twiddle phase: nineteen twiddle vectors are fetched with LDW from
 * W[TWVL * j], j = 0, 2, ..., 36, and applied with VZMUL or VZMULI.
 * T2B is the only term that is used without a twiddle. */
Chris@82 138 T2B = VADD(T1Y, T25);
Chris@82 139 T1R = LDW(&(W[TWVL * 18]));
Chris@82 140 T1S = VZMUL(T1R, VADD(TU, T19));
Chris@82 141 T1G = VADD(T4, Tj);
Chris@82 142 T1H = VADD(TF, TI);
Chris@82 143 T1F = LDW(&(W[TWVL * 28]));
Chris@82 144 T1I = VZMULI(T1F, VFNMSI(T1H, T1G));
Chris@82 145 T1V = LDW(&(W[TWVL * 8]));
Chris@82 146 T1W = VZMULI(T1V, VFMAI(T1H, T1G));
Chris@82 147 {
Chris@82 148 V T2b, T2p, T28, T2o, T26, T1X, T2v, T2h, T2n, T1f, T1B, T1c, T1A, T1a, TR;
Chris@82 149 V T1J, T1r, T1z, T1m, T1O, T1p, T1P, T1k, T1n, T1j, T2z, T1N, T2l, TC, T1w;
Chris@82 150 V TP, T1x, Tm, TL, T1, T2t, T1v, T2f;
Chris@82 151 T2b = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T2a, T29));
Chris@82 152 T2p = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T29, T2a));
Chris@82 153 T26 = VFNMS(LDK(KP250000000), T25, T1Y);
Chris@82 154 T28 = VFMA(LDK(KP559016994), T27, T26);
Chris@82 155 T2o = VFNMS(LDK(KP559016994), T27, T26);
Chris@82 156 T1X = LDW(&(W[TWVL * 6]));
Chris@82 157 T2c = VZMUL(T1X, VFNMSI(T2b, T28));
Chris@82 158 T2v = LDW(&(W[TWVL * 22]));
Chris@82 159 T2w = VZMUL(T2v, VFNMSI(T2p, T2o));
Chris@82 160 T2h = LDW(&(W[TWVL * 30]));
Chris@82 161 T2i = VZMUL(T2h, VFMAI(T2b, T28));
Chris@82 162 T2n = LDW(&(W[TWVL * 14]));
Chris@82 163 T2q = VZMUL(T2n, VFMAI(T2p, T2o));
Chris@82 164 T1f = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1e, T1d));
Chris@82 165 T1B = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1d, T1e));
Chris@82 166 T1a = VFNMS(LDK(KP250000000), T19, TU);
Chris@82 167 T1c = VFNMS(LDK(KP559016994), T1b, T1a);
Chris@82 168 T1A = VFMA(LDK(KP559016994), T1b, T1a);
Chris@82 169 TR = LDW(&(W[TWVL * 2]));
Chris@82 170 T1g = VZMUL(TR, VFNMSI(T1f, T1c));
Chris@82 171 T1J = LDW(&(W[TWVL * 26]));
Chris@82 172 T1K = VZMUL(T1J, VFNMSI(T1B, T1A));
Chris@82 173 T1r = LDW(&(W[TWVL * 34]));
Chris@82 174 T1s = VZMUL(T1r, VFMAI(T1f, T1c));
Chris@82 175 T1z = LDW(&(W[TWVL * 10]));
Chris@82 176 T1C = VZMUL(T1z, VFMAI(T1B, T1A));
Chris@82 177 T1k = VFMA(LDK(KP559016994), Tl, Tk);
Chris@82 178 T1m = VFNMS(LDK(KP951056516), T1l, T1k);
Chris@82 179 T1O = VFMA(LDK(KP951056516), T1l, T1k);
Chris@82 180 T1n = VFMA(LDK(KP559016994), TK, TJ);
Chris@82 181 T1p = VFMA(LDK(KP951056516), T1o, T1n);
Chris@82 182 T1P = VFNMS(LDK(KP951056516), T1o, T1n);
Chris@82 183 T1j = LDW(&(W[TWVL * 36]));
Chris@82 184 T1q = VZMULI(T1j, VFNMSI(T1p, T1m));
Chris@82 185 T2z = LDW(&(W[0]));
Chris@82 186 T2A = VZMULI(T2z, VFMAI(T1p, T1m));
Chris@82 187 T1N = LDW(&(W[TWVL * 20]));
Chris@82 188 T1Q = VZMULI(T1N, VFNMSI(T1P, T1O));
Chris@82 189 T2l = LDW(&(W[TWVL * 16]));
Chris@82 190 T2m = VZMULI(T2l, VFMAI(T1P, T1O));
Chris@82 191 Tm = VFNMS(LDK(KP559016994), Tl, Tk);
Chris@82 192 TC = VFMA(LDK(KP951056516), TB, Tm);
Chris@82 193 T1w = VFNMS(LDK(KP951056516), TB, Tm);
Chris@82 194 TL = VFNMS(LDK(KP559016994), TK, TJ);
Chris@82 195 TP = VFNMS(LDK(KP951056516), TO, TL);
Chris@82 196 T1x = VFMA(LDK(KP951056516), TO, TL);
Chris@82 197 T1 = LDW(&(W[TWVL * 4]));
Chris@82 198 TQ = VZMULI(T1, VFNMSI(TP, TC));
Chris@82 199 T2t = LDW(&(W[TWVL * 24]));
Chris@82 200 T2u = VZMULI(T2t, VFMAI(T1x, T1w));
Chris@82 201 T1v = LDW(&(W[TWVL * 12]));
Chris@82 202 T1y = VZMULI(T1v, VFNMSI(T1x, T1w));
Chris@82 203 T2f = LDW(&(W[TWVL * 32]));
Chris@82 204 T2g = VZMULI(T2f, VFMAI(TP, TC));
Chris@82 205 }
/* Store phase: for every k in 0..9, Rp[WS(rs, k)] receives a + b and
 * Rm[WS(rs, k)] receives VCONJ(b - a), where a is the VZMULI product and b
 * the VZMUL product for that k (for k = 0, b is the untwiddled T2B). */
Chris@82 206 T1h = VADD(TQ, T1g);
Chris@82 207 ST(&(Rp[WS(rs, 1)]), T1h, ms, &(Rp[WS(rs, 1)]));
Chris@82 208 T1i = VCONJ(VSUB(T1g, TQ));
Chris@82 209 ST(&(Rm[WS(rs, 1)]), T1i, -ms, &(Rm[WS(rs, 1)]));
Chris@82 210 T2s = VCONJ(VSUB(T2q, T2m));
Chris@82 211 ST(&(Rm[WS(rs, 4)]), T2s, -ms, &(Rm[0]));
Chris@82 212 T2D = VCONJ(VSUB(T2B, T2A));
Chris@82 213 ST(&(Rm[0]), T2D, -ms, &(Rm[0]));
Chris@82 214 T1D = VADD(T1y, T1C);
Chris@82 215 ST(&(Rp[WS(rs, 3)]), T1D, ms, &(Rp[WS(rs, 1)]));
Chris@82 216 T2x = VADD(T2u, T2w);
Chris@82 217 ST(&(Rp[WS(rs, 6)]), T2x, ms, &(Rp[0]));
Chris@82 218 T2y = VCONJ(VSUB(T2w, T2u));
Chris@82 219 ST(&(Rm[WS(rs, 6)]), T2y, -ms, &(Rm[0]));
Chris@82 220 T2C = VADD(T2A, T2B);
Chris@82 221 ST(&(Rp[0]), T2C, ms, &(Rp[0]));
Chris@82 222 T1u = VCONJ(VSUB(T1s, T1q));
Chris@82 223 ST(&(Rm[WS(rs, 9)]), T1u, -ms, &(Rm[WS(rs, 1)]));
Chris@82 224 T1t = VADD(T1q, T1s);
Chris@82 225 ST(&(Rp[WS(rs, 9)]), T1t, ms, &(Rp[WS(rs, 1)]));
Chris@82 226 T1E = VCONJ(VSUB(T1C, T1y));
Chris@82 227 ST(&(Rm[WS(rs, 3)]), T1E, -ms, &(Rm[WS(rs, 1)]));
Chris@82 228 T1L = VADD(T1I, T1K);
Chris@82 229 ST(&(Rp[WS(rs, 7)]), T1L, ms, &(Rp[WS(rs, 1)]));
Chris@82 230 T2d = VADD(T1W, T2c);
Chris@82 231 ST(&(Rp[WS(rs, 2)]), T2d, ms, &(Rp[0]));
Chris@82 232 T2r = VADD(T2m, T2q);
Chris@82 233 ST(&(Rp[WS(rs, 4)]), T2r, ms, &(Rp[0]));
Chris@82 234 T1U = VCONJ(VSUB(T1S, T1Q));
Chris@82 235 ST(&(Rm[WS(rs, 5)]), T1U, -ms, &(Rm[WS(rs, 1)]));
Chris@82 236 T2e = VCONJ(VSUB(T2c, T1W));
Chris@82 237 ST(&(Rm[WS(rs, 2)]), T2e, -ms, &(Rm[0]));
Chris@82 238 T2j = VADD(T2g, T2i);
Chris@82 239 ST(&(Rp[WS(rs, 8)]), T2j, ms, &(Rp[0]));
Chris@82 240 T2k = VCONJ(VSUB(T2i, T2g));
Chris@82 241 ST(&(Rm[WS(rs, 8)]), T2k, -ms, &(Rm[0]));
Chris@82 242 T1T = VADD(T1Q, T1S);
Chris@82 243 ST(&(Rp[WS(rs, 5)]), T1T, ms, &(Rp[WS(rs, 1)]));
Chris@82 244 T1M = VCONJ(VSUB(T1K, T1I));
Chris@82 245 ST(&(Rm[WS(rs, 7)]), T1M, -ms, &(Rm[WS(rs, 1)]));
Chris@82 246 }
Chris@82 247 }
Chris@82 248 }
Chris@82 249 VLEAVE();
Chris@82 250 }
Chris@82 251
/* Twiddle instruction table for the FMA-branch kernel: one VTW(1, k) entry
 * for each k = 1..19, closed by a {TW_NEXT, VL, 0} record. The nineteen
 * entries match the nineteen LDW twiddle loads in hc2cbdftv_20. */
Chris@82 252 static const tw_instr twinstr[] = {
Chris@82 253 VTW(1, 1),
Chris@82 254 VTW(1, 2),
Chris@82 255 VTW(1, 3),
Chris@82 256 VTW(1, 4),
Chris@82 257 VTW(1, 5),
Chris@82 258 VTW(1, 6),
Chris@82 259 VTW(1, 7),
Chris@82 260 VTW(1, 8),
Chris@82 261 VTW(1, 9),
Chris@82 262 VTW(1, 10),
Chris@82 263 VTW(1, 11),
Chris@82 264 VTW(1, 12),
Chris@82 265 VTW(1, 13),
Chris@82 266 VTW(1, 14),
Chris@82 267 VTW(1, 15),
Chris@82 268 VTW(1, 16),
Chris@82 269 VTW(1, 17),
Chris@82 270 VTW(1, 18),
Chris@82 271 VTW(1, 19),
Chris@82 272 {TW_NEXT, VL, 0}
Chris@82 273 };
Chris@82 274
/* Descriptor for the FMA-branch kernel: the leading 20 (presumably the
 * transform size, matching "-n 20" in the generator command line), the
 * codelet name string, the twiddle table, &GENUS, and the counts
 * {77, 42, 66, 0}, which repeat the additions / multiplications / fused
 * multiply-add figures from the statistics comment above the kernel. */
Chris@82 275 static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cbdftv_20"), twinstr, &GENUS, {77, 42, 66, 0} };
Chris@82 276
/* Registration entry point (FMA branch): hands the kernel hc2cbdftv_20 and
 * its descriptor to planner p through X(khc2c_register), tagged with
 * HC2C_VIA_DFT. */
Chris@82 277 void XSIMD(codelet_hc2cbdftv_20) (planner *p) {
Chris@82 278 X(khc2c_register) (p, hc2cbdftv_20, &desc, HC2C_VIA_DFT);
Chris@82 279 }
Chris@82 280 #else
Chris@82 281
Chris@82 282 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dif -sign 1 -name hc2cbdftv_20 -include rdft/simd/hc2cbv.h */
Chris@82 283
Chris@82 284 /*
Chris@82 285 * This function contains 143 FP additions, 62 FP multiplications,
Chris@82 286 * (or, 131 additions, 50 multiplications, 12 fused multiply/add),
Chris@82 287 * 114 stack variables, 4 constants, and 40 memory accesses
Chris@82 288 */
Chris@82 289 #include "rdft/simd/hc2cbv.h"
Chris@82 290
/*
 * hc2cbdftv_20 (non-FMA branch): SIMD codelet emitted by genfft with
 * "-n 20 -dif -sign 1" (see the "Generated by" comment above). It is
 * compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is
 * defined.
 *
 * On every loop iteration the body loads ten vectors from Rp[WS(rs, 0..9)]
 * and ten from Rm[WS(rs, 0..9)], combines them, multiplies the results by
 * the nineteen twiddle vectors read from W, and stores ten vectors back to
 * Rp and ten back to Rm, i.e. the arrays are updated in place.
 *
 * Rp, Rm : input/output arrays, indexed through WS(rs, k). Rp advances by
 *          VL * ms per iteration, Rm moves backwards by VL * ms.
 * Ip, Im : stepped by the loop header but never dereferenced in this body.
 * W      : twiddle table; first offset by (mb - 1) * ((TWVL / VL) * 38),
 *          then advanced by TWVL * 38 per iteration.
 * rs     : stride handed to WS() and MAKE_VOLATILE_STRIDE().
 * mb, me : the loop runs m = mb, mb + VL, ... while m < me.
 * ms     : per-m stride; also passed (negated for Rm) to LD() and ST().
 */
Chris@82 291 static void hc2cbdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 292 {
/* Numeric constants; by value these are 1/4, sqrt(5)/4, sin(2*pi/5) and
 * sin(pi/5). */
Chris@82 293 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 294 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 295 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 296 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 297 {
Chris@82 298 INT m;
/* One pass per group of VL values of m; all pointer bookkeeping is done in
 * the for-header itself. */
Chris@82 299 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
/* Values computed in the load phase that the twiddle phase still needs. */
Chris@82 300 V TK, T1v, TY, T1x, T1j, T2f, TS, TT, TO, TU, T5, To, Tp, Tq, T2a;
Chris@82 301 V T2d, T2g, T2k, T2j, T1k, T1l, T18, T1m, T1f;
Chris@82 302 {
Chris@82 303 V T2, TP, T4, TR, TI, T1d, T9, T12, Td, T15, TE, T1a, Tv, T13, Tm;
Chris@82 304 V T1c, Tz, T16, Ti, T19, T3, TQ, TH, TG, TF, T6, T8, T7, Tc, Tb;
Chris@82 305 V Ta, TD, TC, TB, Ts, Tu, Tt, Tl, Tk, Tj, Tw, Ty, Tx, Tf, Th;
Chris@82 306 V Tg, TA, TJ, TW, TX, T1h, T1i, TM, TN, Te, Tn, T28, T29, T2b, T2c;
Chris@82 307 V T14, T17, T1b, T1e;
/* Load phase: each Rp[WS(rs, k)] is paired with Rm[WS(rs, 9 - k)]. Every
 * Rm value goes through VCONJ before the pair is summed (VADD) and
 * differenced (VSUB). */
Chris@82 308 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@82 309 TP = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 310 T3 = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 311 T4 = VCONJ(T3);
Chris@82 312 TQ = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@82 313 TR = VCONJ(TQ);
Chris@82 314 TH = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 315 TF = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@82 316 TG = VCONJ(TF);
Chris@82 317 TI = VSUB(TG, TH);
Chris@82 318 T1d = VADD(TG, TH);
Chris@82 319 T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@82 320 T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 321 T8 = VCONJ(T7);
Chris@82 322 T9 = VSUB(T6, T8);
Chris@82 323 T12 = VADD(T6, T8);
Chris@82 324 Tc = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
Chris@82 325 Ta = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 326 Tb = VCONJ(Ta);
Chris@82 327 Td = VSUB(Tb, Tc);
Chris@82 328 T15 = VADD(Tb, Tc);
Chris@82 329 TD = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 330 TB = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
Chris@82 331 TC = VCONJ(TB);
Chris@82 332 TE = VSUB(TC, TD);
Chris@82 333 T1a = VADD(TC, TD);
Chris@82 334 Ts = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 335 Tt = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@82 336 Tu = VCONJ(Tt);
Chris@82 337 Tv = VSUB(Ts, Tu);
Chris@82 338 T13 = VADD(Ts, Tu);
Chris@82 339 Tl = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@82 340 Tj = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 341 Tk = VCONJ(Tj);
Chris@82 342 Tm = VSUB(Tk, Tl);
Chris@82 343 T1c = VADD(Tk, Tl);
Chris@82 344 Tw = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 345 Tx = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
Chris@82 346 Ty = VCONJ(Tx);
Chris@82 347 Tz = VSUB(Tw, Ty);
Chris@82 348 T16 = VADD(Tw, Ty);
Chris@82 349 Tf = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
Chris@82 350 Tg = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 351 Th = VCONJ(Tg);
Chris@82 352 Ti = VSUB(Tf, Th);
Chris@82 353 T19 = VADD(Tf, Th);
/* All twenty inputs are loaded; the rest of this scope only forms sums,
 * differences and constant-scaled combinations of them. */
Chris@82 354 TA = VSUB(Tv, Tz);
Chris@82 355 TJ = VSUB(TE, TI);
Chris@82 356 TK = VFNMS(LDK(KP951056516), TJ, VMUL(LDK(KP587785252), TA));
Chris@82 357 T1v = VFMA(LDK(KP951056516), TA, VMUL(LDK(KP587785252), TJ));
Chris@82 358 TW = VSUB(T9, Td);
Chris@82 359 TX = VSUB(Ti, Tm);
Chris@82 360 TY = VFNMS(LDK(KP951056516), TX, VMUL(LDK(KP587785252), TW));
Chris@82 361 T1x = VFMA(LDK(KP951056516), TW, VMUL(LDK(KP587785252), TX));
Chris@82 362 T1h = VADD(T2, T4);
Chris@82 363 T1i = VADD(TP, TR);
Chris@82 364 T1j = VSUB(T1h, T1i);
Chris@82 365 T2f = VADD(T1h, T1i);
Chris@82 366 TS = VSUB(TP, TR);
Chris@82 367 TM = VADD(Tv, Tz);
Chris@82 368 TN = VADD(TE, TI);
Chris@82 369 TT = VADD(TM, TN);
Chris@82 370 TO = VMUL(LDK(KP559016994), VSUB(TM, TN));
Chris@82 371 TU = VFNMS(LDK(KP250000000), TT, TS);
Chris@82 372 T5 = VSUB(T2, T4);
Chris@82 373 Te = VADD(T9, Td);
Chris@82 374 Tn = VADD(Ti, Tm);
Chris@82 375 To = VADD(Te, Tn);
Chris@82 376 Tp = VFNMS(LDK(KP250000000), To, T5);
Chris@82 377 Tq = VMUL(LDK(KP559016994), VSUB(Te, Tn));
Chris@82 378 T28 = VADD(T12, T13);
Chris@82 379 T29 = VADD(T15, T16);
Chris@82 380 T2a = VADD(T28, T29);
Chris@82 381 T2b = VADD(T19, T1a);
Chris@82 382 T2c = VADD(T1c, T1d);
Chris@82 383 T2d = VADD(T2b, T2c);
Chris@82 384 T2g = VADD(T2a, T2d);
Chris@82 385 T2k = VSUB(T2b, T2c);
Chris@82 386 T2j = VSUB(T28, T29);
Chris@82 387 T14 = VSUB(T12, T13);
Chris@82 388 T17 = VSUB(T15, T16);
Chris@82 389 T1k = VADD(T14, T17);
Chris@82 390 T1b = VSUB(T19, T1a);
Chris@82 391 T1e = VSUB(T1c, T1d);
Chris@82 392 T1l = VADD(T1b, T1e);
Chris@82 393 T18 = VSUB(T14, T17);
Chris@82 394 T1m = VADD(T1k, T1l);
Chris@82 395 T1f = VSUB(T1b, T1e);
Chris@82 396 }
Chris@82 397 {
Chris@82 398 V T2L, T22, T1S, T26, T2m, T2G, T2s, T2A, T1q, T1U, T1C, T1M, T10, T2E, T1I;
Chris@82 399 V T2q, T1A, T2K, T20, T2w, T21, T1Q, T1R, T1P, T25, T1r, T1s, T2C, T2N, T1N;
Chris@82 400 V T2H, T2I, T2M, T1E, T1D, T1O, T1V, T2n, T2B, T24, T2o, T2t, T2u, T23, T1W;
/* Twiddle phase: nineteen twiddle vectors are fetched with LDW from
 * W[TWVL * j], j = 0, 2, ..., 36, and applied with VZMUL or VZMULI.
 * T2L is the only term that is used without a twiddle. */
Chris@82 401 T2L = VADD(T2f, T2g);
Chris@82 402 T21 = LDW(&(W[TWVL * 18]));
Chris@82 403 T22 = VZMUL(T21, VADD(T1j, T1m));
Chris@82 404 T1Q = VADD(T5, To);
Chris@82 405 T1R = VBYI(VADD(TS, TT));
Chris@82 406 T1P = LDW(&(W[TWVL * 28]));
Chris@82 407 T1S = VZMULI(T1P, VSUB(T1Q, T1R));
Chris@82 408 T25 = LDW(&(W[TWVL * 8]));
Chris@82 409 T26 = VZMULI(T25, VADD(T1Q, T1R));
Chris@82 410 {
Chris@82 411 V T2l, T2z, T2i, T2y, T2e, T2h, T27, T2F, T2r, T2x, T1g, T1K, T1p, T1L, T1n;
Chris@82 412 V T1o, T11, T1T, T1B, T1J, TL, T1G, TZ, T1H, Tr, TV, T1, T2D, T1F, T2p;
Chris@82 413 V T1w, T1Y, T1z, T1Z, T1u, T1y, T1t, T2J, T1X, T2v;
Chris@82 414 T2l = VBYI(VFMA(LDK(KP951056516), T2j, VMUL(LDK(KP587785252), T2k)));
Chris@82 415 T2z = VBYI(VFNMS(LDK(KP951056516), T2k, VMUL(LDK(KP587785252), T2j)));
Chris@82 416 T2e = VMUL(LDK(KP559016994), VSUB(T2a, T2d));
Chris@82 417 T2h = VFNMS(LDK(KP250000000), T2g, T2f);
Chris@82 418 T2i = VADD(T2e, T2h);
Chris@82 419 T2y = VSUB(T2h, T2e);
Chris@82 420 T27 = LDW(&(W[TWVL * 6]));
Chris@82 421 T2m = VZMUL(T27, VSUB(T2i, T2l));
Chris@82 422 T2F = LDW(&(W[TWVL * 22]));
Chris@82 423 T2G = VZMUL(T2F, VADD(T2z, T2y));
Chris@82 424 T2r = LDW(&(W[TWVL * 30]));
Chris@82 425 T2s = VZMUL(T2r, VADD(T2l, T2i));
Chris@82 426 T2x = LDW(&(W[TWVL * 14]));
Chris@82 427 T2A = VZMUL(T2x, VSUB(T2y, T2z));
Chris@82 428 T1g = VBYI(VFNMS(LDK(KP951056516), T1f, VMUL(LDK(KP587785252), T18)));
Chris@82 429 T1K = VBYI(VFMA(LDK(KP951056516), T18, VMUL(LDK(KP587785252), T1f)));
Chris@82 430 T1n = VFNMS(LDK(KP250000000), T1m, T1j);
Chris@82 431 T1o = VMUL(LDK(KP559016994), VSUB(T1k, T1l));
Chris@82 432 T1p = VSUB(T1n, T1o);
Chris@82 433 T1L = VADD(T1o, T1n);
Chris@82 434 T11 = LDW(&(W[TWVL * 2]));
Chris@82 435 T1q = VZMUL(T11, VADD(T1g, T1p));
Chris@82 436 T1T = LDW(&(W[TWVL * 26]));
Chris@82 437 T1U = VZMUL(T1T, VSUB(T1L, T1K));
Chris@82 438 T1B = LDW(&(W[TWVL * 34]));
Chris@82 439 T1C = VZMUL(T1B, VSUB(T1p, T1g));
Chris@82 440 T1J = LDW(&(W[TWVL * 10]));
Chris@82 441 T1M = VZMUL(T1J, VADD(T1K, T1L));
Chris@82 442 Tr = VSUB(Tp, Tq);
Chris@82 443 TL = VSUB(Tr, TK);
Chris@82 444 T1G = VADD(Tr, TK);
Chris@82 445 TV = VSUB(TO, TU);
Chris@82 446 TZ = VBYI(VSUB(TV, TY));
Chris@82 447 T1H = VBYI(VADD(TY, TV));
Chris@82 448 T1 = LDW(&(W[TWVL * 4]));
Chris@82 449 T10 = VZMULI(T1, VADD(TL, TZ));
Chris@82 450 T2D = LDW(&(W[TWVL * 24]));
Chris@82 451 T2E = VZMULI(T2D, VSUB(T1G, T1H));
Chris@82 452 T1F = LDW(&(W[TWVL * 12]));
Chris@82 453 T1I = VZMULI(T1F, VADD(T1G, T1H));
Chris@82 454 T2p = LDW(&(W[TWVL * 32]));
Chris@82 455 T2q = VZMULI(T2p, VSUB(TL, TZ));
Chris@82 456 T1u = VADD(Tq, Tp);
Chris@82 457 T1w = VSUB(T1u, T1v);
Chris@82 458 T1Y = VADD(T1u, T1v);
Chris@82 459 T1y = VADD(TO, TU);
Chris@82 460 T1z = VBYI(VADD(T1x, T1y));
Chris@82 461 T1Z = VBYI(VSUB(T1y, T1x));
Chris@82 462 T1t = LDW(&(W[TWVL * 36]));
Chris@82 463 T1A = VZMULI(T1t, VSUB(T1w, T1z));
Chris@82 464 T2J = LDW(&(W[0]));
Chris@82 465 T2K = VZMULI(T2J, VADD(T1w, T1z));
Chris@82 466 T1X = LDW(&(W[TWVL * 20]));
Chris@82 467 T20 = VZMULI(T1X, VSUB(T1Y, T1Z));
Chris@82 468 T2v = LDW(&(W[TWVL * 16]));
Chris@82 469 T2w = VZMULI(T2v, VADD(T1Y, T1Z));
Chris@82 470 }
/* Store phase: for every k in 0..9, Rp[WS(rs, k)] receives a + b and
 * Rm[WS(rs, k)] receives VCONJ(b - a), where a is the VZMULI product and b
 * the VZMUL product for that k (for k = 0, b is the untwiddled T2L). */
Chris@82 471 T1r = VADD(T10, T1q);
Chris@82 472 ST(&(Rp[WS(rs, 1)]), T1r, ms, &(Rp[WS(rs, 1)]));
Chris@82 473 T1s = VCONJ(VSUB(T1q, T10));
Chris@82 474 ST(&(Rm[WS(rs, 1)]), T1s, -ms, &(Rm[WS(rs, 1)]));
Chris@82 475 T2C = VCONJ(VSUB(T2A, T2w));
Chris@82 476 ST(&(Rm[WS(rs, 4)]), T2C, -ms, &(Rm[0]));
Chris@82 477 T2N = VCONJ(VSUB(T2L, T2K));
Chris@82 478 ST(&(Rm[0]), T2N, -ms, &(Rm[0]));
Chris@82 479 T1N = VADD(T1I, T1M);
Chris@82 480 ST(&(Rp[WS(rs, 3)]), T1N, ms, &(Rp[WS(rs, 1)]));
Chris@82 481 T2H = VADD(T2E, T2G);
Chris@82 482 ST(&(Rp[WS(rs, 6)]), T2H, ms, &(Rp[0]));
Chris@82 483 T2I = VCONJ(VSUB(T2G, T2E));
Chris@82 484 ST(&(Rm[WS(rs, 6)]), T2I, -ms, &(Rm[0]));
Chris@82 485 T2M = VADD(T2K, T2L);
Chris@82 486 ST(&(Rp[0]), T2M, ms, &(Rp[0]));
Chris@82 487 T1E = VCONJ(VSUB(T1C, T1A));
Chris@82 488 ST(&(Rm[WS(rs, 9)]), T1E, -ms, &(Rm[WS(rs, 1)]));
Chris@82 489 T1D = VADD(T1A, T1C);
Chris@82 490 ST(&(Rp[WS(rs, 9)]), T1D, ms, &(Rp[WS(rs, 1)]));
Chris@82 491 T1O = VCONJ(VSUB(T1M, T1I));
Chris@82 492 ST(&(Rm[WS(rs, 3)]), T1O, -ms, &(Rm[WS(rs, 1)]));
Chris@82 493 T1V = VADD(T1S, T1U);
Chris@82 494 ST(&(Rp[WS(rs, 7)]), T1V, ms, &(Rp[WS(rs, 1)]));
Chris@82 495 T2n = VADD(T26, T2m);
Chris@82 496 ST(&(Rp[WS(rs, 2)]), T2n, ms, &(Rp[0]));
Chris@82 497 T2B = VADD(T2w, T2A);
Chris@82 498 ST(&(Rp[WS(rs, 4)]), T2B, ms, &(Rp[0]));
Chris@82 499 T24 = VCONJ(VSUB(T22, T20));
Chris@82 500 ST(&(Rm[WS(rs, 5)]), T24, -ms, &(Rm[WS(rs, 1)]));
Chris@82 501 T2o = VCONJ(VSUB(T2m, T26));
Chris@82 502 ST(&(Rm[WS(rs, 2)]), T2o, -ms, &(Rm[0]));
Chris@82 503 T2t = VADD(T2q, T2s);
Chris@82 504 ST(&(Rp[WS(rs, 8)]), T2t, ms, &(Rp[0]));
Chris@82 505 T2u = VCONJ(VSUB(T2s, T2q));
Chris@82 506 ST(&(Rm[WS(rs, 8)]), T2u, -ms, &(Rm[0]));
Chris@82 507 T23 = VADD(T20, T22);
Chris@82 508 ST(&(Rp[WS(rs, 5)]), T23, ms, &(Rp[WS(rs, 1)]));
Chris@82 509 T1W = VCONJ(VSUB(T1U, T1S));
Chris@82 510 ST(&(Rm[WS(rs, 7)]), T1W, -ms, &(Rm[WS(rs, 1)]));
Chris@82 511 }
Chris@82 512 }
Chris@82 513 }
Chris@82 514 VLEAVE();
Chris@82 515 }
Chris@82 516
/* Twiddle instruction table for the non-FMA-branch kernel: one VTW(1, k)
 * entry for each k = 1..19, closed by a {TW_NEXT, VL, 0} record. The
 * nineteen entries match the nineteen LDW twiddle loads in hc2cbdftv_20. */
Chris@82 517 static const tw_instr twinstr[] = {
Chris@82 518 VTW(1, 1),
Chris@82 519 VTW(1, 2),
Chris@82 520 VTW(1, 3),
Chris@82 521 VTW(1, 4),
Chris@82 522 VTW(1, 5),
Chris@82 523 VTW(1, 6),
Chris@82 524 VTW(1, 7),
Chris@82 525 VTW(1, 8),
Chris@82 526 VTW(1, 9),
Chris@82 527 VTW(1, 10),
Chris@82 528 VTW(1, 11),
Chris@82 529 VTW(1, 12),
Chris@82 530 VTW(1, 13),
Chris@82 531 VTW(1, 14),
Chris@82 532 VTW(1, 15),
Chris@82 533 VTW(1, 16),
Chris@82 534 VTW(1, 17),
Chris@82 535 VTW(1, 18),
Chris@82 536 VTW(1, 19),
Chris@82 537 {TW_NEXT, VL, 0}
Chris@82 538 };
Chris@82 539
/* Descriptor for the non-FMA-branch kernel: the leading 20 (presumably the
 * transform size, matching "-n 20" in the generator command line), the
 * codelet name string, the twiddle table, &GENUS, and the counts
 * {131, 50, 12, 0}, which repeat the additions / multiplications / fused
 * multiply-add figures from the statistics comment above the kernel. */
Chris@82 540 static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cbdftv_20"), twinstr, &GENUS, {131, 50, 12, 0} };
Chris@82 541
/* Registration entry point (non-FMA branch): hands the kernel hc2cbdftv_20
 * and its descriptor to planner p through X(khc2c_register), tagged with
 * HC2C_VIA_DFT. */
Chris@82 542 void XSIMD(codelet_hc2cbdftv_20) (planner *p) {
Chris@82 543 X(khc2c_register) (p, hc2cbdftv_20, &desc, HC2C_VIA_DFT);
Chris@82 544 }
Chris@82 545 #endif