annotate src/fftw-3.3.8/rdft/simd/common/hc2cbdftv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:08:12 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dif -sign 1 -name hc2cbdftv_32 -include rdft/simd/hc2cbv.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 249 FP additions, 192 FP multiplications,
Chris@82 32 * (or, 119 additions, 62 multiplications, 130 fused multiply/add),
Chris@82 33 * 143 stack variables, 7 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/simd/hc2cbv.h"
Chris@82 36
Chris@82 37 static void hc2cbdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 40 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 41 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 42 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 44 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 46 {
Chris@82 47 INT m;
Chris@82 48 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
Chris@82 49 V Ts, T1S, T3p, T45, T3A, T48, T1b, T1V, T1o, T2G, T2o, T2Y, T2z, T31, T1L;
Chris@82 50 V T2H, T2J, T2K, TJ, T1c, T3D, T46, T10, T1d, T2r, T2A, T3w, T49, T1D, T1M;
Chris@82 51 V T2u, T2B;
Chris@82 52 {
Chris@82 53 V T4, T1i, T15, T1j, Tb, T1m, T16, T1l, T1G, T1F, Tj, T3m, T18, T1J, T1I;
Chris@82 54 V Tq, T3n, T19, T2, T3, T13, T14, T5, T6, T7, T8, T9, Ta, Tf, Ti;
Chris@82 55 V Td, Te, Tg, Th, Tm, Tp, Tk, Tl, Tn, To, Tc, Tr, T3l, T3o, T3y;
Chris@82 56 V T3z, T17, T1a, T1k, T1n, T2m, T2n, T2x, T2y, T1H, T1K;
Chris@82 57 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@82 58 T3 = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 59 T4 = VFNMSCONJ(T3, T2);
Chris@82 60 T1i = VFMACONJ(T3, T2);
Chris@82 61 T13 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
Chris@82 62 T14 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 63 T15 = VFNMSCONJ(T14, T13);
Chris@82 64 T1j = VFMACONJ(T14, T13);
Chris@82 65 T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@82 66 T6 = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 67 T7 = VFNMSCONJ(T6, T5);
Chris@82 68 T8 = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
Chris@82 69 T9 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 70 Ta = VFMSCONJ(T9, T8);
Chris@82 71 Tb = VADD(T7, Ta);
Chris@82 72 T1m = VFMACONJ(T9, T8);
Chris@82 73 T16 = VSUB(T7, Ta);
Chris@82 74 T1l = VFMACONJ(T6, T5);
Chris@82 75 Td = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
Chris@82 76 Te = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 77 Tf = VFNMSCONJ(Te, Td);
Chris@82 78 T1G = VFMACONJ(Te, Td);
Chris@82 79 Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@82 80 Th = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 81 Ti = VFNMSCONJ(Th, Tg);
Chris@82 82 T1F = VFMACONJ(Th, Tg);
Chris@82 83 Tj = VFMA(LDK(KP414213562), Ti, Tf);
Chris@82 84 T3m = VSUB(T1F, T1G);
Chris@82 85 T18 = VFNMS(LDK(KP414213562), Tf, Ti);
Chris@82 86 Tk = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
Chris@82 87 Tl = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 88 Tm = VFNMSCONJ(Tl, Tk);
Chris@82 89 T1J = VFMACONJ(Tl, Tk);
Chris@82 90 Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
Chris@82 91 To = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 92 Tp = VFMSCONJ(To, Tn);
Chris@82 93 T1I = VFMACONJ(To, Tn);
Chris@82 94 Tq = VFNMS(LDK(KP414213562), Tp, Tm);
Chris@82 95 T3n = VSUB(T1I, T1J);
Chris@82 96 T19 = VFMA(LDK(KP414213562), Tm, Tp);
Chris@82 97 Tc = VFNMS(LDK(KP707106781), Tb, T4);
Chris@82 98 Tr = VSUB(Tj, Tq);
Chris@82 99 Ts = VFMA(LDK(KP923879532), Tr, Tc);
Chris@82 100 T1S = VFNMS(LDK(KP923879532), Tr, Tc);
Chris@82 101 T3l = VSUB(T1i, T1j);
Chris@82 102 T3o = VADD(T3m, T3n);
Chris@82 103 T3p = VFMA(LDK(KP707106781), T3o, T3l);
Chris@82 104 T45 = VFNMS(LDK(KP707106781), T3o, T3l);
Chris@82 105 T3y = VSUB(T1l, T1m);
Chris@82 106 T3z = VSUB(T3m, T3n);
Chris@82 107 T3A = VFMA(LDK(KP707106781), T3z, T3y);
Chris@82 108 T48 = VFNMS(LDK(KP707106781), T3z, T3y);
Chris@82 109 T17 = VFNMS(LDK(KP707106781), T16, T15);
Chris@82 110 T1a = VSUB(T18, T19);
Chris@82 111 T1b = VFNMS(LDK(KP923879532), T1a, T17);
Chris@82 112 T1V = VFMA(LDK(KP923879532), T1a, T17);
Chris@82 113 T1k = VADD(T1i, T1j);
Chris@82 114 T1n = VADD(T1l, T1m);
Chris@82 115 T1o = VSUB(T1k, T1n);
Chris@82 116 T2G = VADD(T1k, T1n);
Chris@82 117 T2m = VFMA(LDK(KP707106781), Tb, T4);
Chris@82 118 T2n = VADD(T18, T19);
Chris@82 119 T2o = VFNMS(LDK(KP923879532), T2n, T2m);
Chris@82 120 T2Y = VFMA(LDK(KP923879532), T2n, T2m);
Chris@82 121 T2x = VFMA(LDK(KP707106781), T16, T15);
Chris@82 122 T2y = VADD(Tj, Tq);
Chris@82 123 T2z = VFNMS(LDK(KP923879532), T2y, T2x);
Chris@82 124 T31 = VFMA(LDK(KP923879532), T2y, T2x);
Chris@82 125 T1H = VADD(T1F, T1G);
Chris@82 126 T1K = VADD(T1I, T1J);
Chris@82 127 T1L = VSUB(T1H, T1K);
Chris@82 128 T2H = VADD(T1H, T1K);
Chris@82 129 }
Chris@82 130 {
Chris@82 131 V Tv, T3q, TG, T1r, TM, T3t, TX, T1y, TC, T3r, TH, T1u, TT, T3u, TY;
Chris@82 132 V T1B, Tt, Tu, T1p, TE, TF, T1q, TK, TL, T1w, TV, TW, T1x, Ty, T1s;
Chris@82 133 V TB, T1t, Tw, Tx, Tz, TA, TP, T1z, TS, T1A, TN, TO, TQ, TR, TD;
Chris@82 134 V TI, T3B, T3C, TU, TZ, T2p, T2q, T3s, T3v, T1v, T1C, T2s, T2t;
Chris@82 135 Tt = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 136 Tu = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
Chris@82 137 T1p = VFMACONJ(Tu, Tt);
Chris@82 138 TE = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 139 TF = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
Chris@82 140 T1q = VFMACONJ(TF, TE);
Chris@82 141 Tv = VFNMSCONJ(Tu, Tt);
Chris@82 142 T3q = VSUB(T1p, T1q);
Chris@82 143 TG = VFNMSCONJ(TF, TE);
Chris@82 144 T1r = VADD(T1p, T1q);
Chris@82 145 TK = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 146 TL = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@82 147 T1w = VFMACONJ(TL, TK);
Chris@82 148 TV = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 149 TW = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
Chris@82 150 T1x = VFMACONJ(TW, TV);
Chris@82 151 TM = VFMSCONJ(TL, TK);
Chris@82 152 T3t = VSUB(T1w, T1x);
Chris@82 153 TX = VFNMSCONJ(TW, TV);
Chris@82 154 T1y = VADD(T1w, T1x);
Chris@82 155 Tw = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 156 Tx = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
Chris@82 157 Ty = VFNMSCONJ(Tx, Tw);
Chris@82 158 T1s = VFMACONJ(Tx, Tw);
Chris@82 159 Tz = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 160 TA = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@82 161 TB = VFMSCONJ(TA, Tz);
Chris@82 162 T1t = VFMACONJ(TA, Tz);
Chris@82 163 TC = VADD(Ty, TB);
Chris@82 164 T3r = VSUB(T1s, T1t);
Chris@82 165 TH = VSUB(Ty, TB);
Chris@82 166 T1u = VADD(T1s, T1t);
Chris@82 167 TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 168 TO = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
Chris@82 169 TP = VFNMSCONJ(TO, TN);
Chris@82 170 T1z = VFMACONJ(TO, TN);
Chris@82 171 TQ = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 172 TR = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@82 173 TS = VFMSCONJ(TR, TQ);
Chris@82 174 T1A = VFMACONJ(TR, TQ);
Chris@82 175 TT = VADD(TP, TS);
Chris@82 176 T3u = VSUB(T1A, T1z);
Chris@82 177 TY = VSUB(TS, TP);
Chris@82 178 T1B = VADD(T1z, T1A);
Chris@82 179 T2J = VADD(T1r, T1u);
Chris@82 180 T2K = VADD(T1y, T1B);
Chris@82 181 TD = VFNMS(LDK(KP707106781), TC, Tv);
Chris@82 182 TI = VFNMS(LDK(KP707106781), TH, TG);
Chris@82 183 TJ = VFMA(LDK(KP668178637), TI, TD);
Chris@82 184 T1c = VFNMS(LDK(KP668178637), TD, TI);
Chris@82 185 T3B = VFMA(LDK(KP414213562), T3q, T3r);
Chris@82 186 T3C = VFMA(LDK(KP414213562), T3t, T3u);
Chris@82 187 T3D = VSUB(T3B, T3C);
Chris@82 188 T46 = VADD(T3B, T3C);
Chris@82 189 TU = VFNMS(LDK(KP707106781), TT, TM);
Chris@82 190 TZ = VFMA(LDK(KP707106781), TY, TX);
Chris@82 191 T10 = VFNMS(LDK(KP668178637), TZ, TU);
Chris@82 192 T1d = VFMA(LDK(KP668178637), TU, TZ);
Chris@82 193 T2p = VFMA(LDK(KP707106781), TH, TG);
Chris@82 194 T2q = VFMA(LDK(KP707106781), TC, Tv);
Chris@82 195 T2r = VFMA(LDK(KP198912367), T2q, T2p);
Chris@82 196 T2A = VFNMS(LDK(KP198912367), T2p, T2q);
Chris@82 197 T3s = VFNMS(LDK(KP414213562), T3r, T3q);
Chris@82 198 T3v = VFNMS(LDK(KP414213562), T3u, T3t);
Chris@82 199 T3w = VADD(T3s, T3v);
Chris@82 200 T49 = VSUB(T3s, T3v);
Chris@82 201 T1v = VSUB(T1r, T1u);
Chris@82 202 T1C = VSUB(T1y, T1B);
Chris@82 203 T1D = VADD(T1v, T1C);
Chris@82 204 T1M = VSUB(T1v, T1C);
Chris@82 205 T2s = VFNMS(LDK(KP707106781), TY, TX);
Chris@82 206 T2t = VFMA(LDK(KP707106781), TT, TM);
Chris@82 207 T2u = VFNMS(LDK(KP198912367), T2t, T2s);
Chris@82 208 T2B = VFMA(LDK(KP198912367), T2s, T2t);
Chris@82 209 }
Chris@82 210 {
Chris@82 211 V T3f, T38, T4p, T4v, T3T, T3Z, T2a, T2i, T4b, T4h, T1O, T20, T2M, T2U, T3F;
Chris@82 212 V T3L, T1g, T3X, T2g, T3J, T2E, T4l, T2S, T4f, T1Y, T4t, T26, T43, T34, T3P;
Chris@82 213 V T3e, T3j, T36, T37, T35, T4n, T4o, T4m, T4u, T3R, T3S, T3Q, T3Y, T28, T29;
Chris@82 214 V T27, T2h, T47, T4a, T44, T4g, T1E, T1N, T1h, T1Z;
Chris@82 215 T36 = VADD(T2G, T2H);
Chris@82 216 T37 = VADD(T2J, T2K);
Chris@82 217 T3f = VADD(T36, T37);
Chris@82 218 T35 = LDW(&(W[TWVL * 30]));
Chris@82 219 T38 = VZMUL(T35, VSUB(T36, T37));
Chris@82 220 T4n = VFMA(LDK(KP923879532), T46, T45);
Chris@82 221 T4o = VFNMS(LDK(KP923879532), T49, T48);
Chris@82 222 T4m = LDW(&(W[TWVL * 10]));
Chris@82 223 T4p = VZMUL(T4m, VFNMSI(T4o, T4n));
Chris@82 224 T4u = LDW(&(W[TWVL * 50]));
Chris@82 225 T4v = VZMUL(T4u, VFMAI(T4o, T4n));
Chris@82 226 T3R = VFMA(LDK(KP923879532), T3w, T3p);
Chris@82 227 T3S = VFMA(LDK(KP923879532), T3D, T3A);
Chris@82 228 T3Q = LDW(&(W[TWVL * 58]));
Chris@82 229 T3T = VZMUL(T3Q, VFNMSI(T3S, T3R));
Chris@82 230 T3Y = LDW(&(W[TWVL * 2]));
Chris@82 231 T3Z = VZMUL(T3Y, VFMAI(T3S, T3R));
Chris@82 232 T28 = VFMA(LDK(KP707106781), T1D, T1o);
Chris@82 233 T29 = VFMA(LDK(KP707106781), T1M, T1L);
Chris@82 234 T27 = LDW(&(W[TWVL * 6]));
Chris@82 235 T2a = VZMUL(T27, VFMAI(T29, T28));
Chris@82 236 T2h = LDW(&(W[TWVL * 54]));
Chris@82 237 T2i = VZMUL(T2h, VFNMSI(T29, T28));
Chris@82 238 T47 = VFNMS(LDK(KP923879532), T46, T45);
Chris@82 239 T4a = VFMA(LDK(KP923879532), T49, T48);
Chris@82 240 T44 = LDW(&(W[TWVL * 18]));
Chris@82 241 T4b = VZMUL(T44, VFMAI(T4a, T47));
Chris@82 242 T4g = LDW(&(W[TWVL * 42]));
Chris@82 243 T4h = VZMUL(T4g, VFNMSI(T4a, T47));
Chris@82 244 T1E = VFNMS(LDK(KP707106781), T1D, T1o);
Chris@82 245 T1N = VFNMS(LDK(KP707106781), T1M, T1L);
Chris@82 246 T1h = LDW(&(W[TWVL * 22]));
Chris@82 247 T1O = VZMUL(T1h, VFNMSI(T1N, T1E));
Chris@82 248 T1Z = LDW(&(W[TWVL * 38]));
Chris@82 249 T20 = VZMUL(T1Z, VFMAI(T1N, T1E));
Chris@82 250 {
Chris@82 251 V T2I, T2L, T2F, T2T, T3x, T3E, T3k, T3K, T12, T2e, T1f, T2f, T11, T1e, T1;
Chris@82 252 V T3W, T2d, T3I, T2w, T2Q, T2D, T2R, T2v, T2C, T2l, T4k, T2P, T4e, T1U, T24;
Chris@82 253 V T1X, T25, T1T, T1W, T1R, T4s, T23, T42, T30, T3c, T33, T3d, T2Z, T32, T2X;
Chris@82 254 V T3O, T3b, T3i;
Chris@82 255 T2I = VSUB(T2G, T2H);
Chris@82 256 T2L = VSUB(T2J, T2K);
Chris@82 257 T2F = LDW(&(W[TWVL * 46]));
Chris@82 258 T2M = VZMUL(T2F, VFNMSI(T2L, T2I));
Chris@82 259 T2T = LDW(&(W[TWVL * 14]));
Chris@82 260 T2U = VZMUL(T2T, VFMAI(T2L, T2I));
Chris@82 261 T3x = VFNMS(LDK(KP923879532), T3w, T3p);
Chris@82 262 T3E = VFNMS(LDK(KP923879532), T3D, T3A);
Chris@82 263 T3k = LDW(&(W[TWVL * 26]));
Chris@82 264 T3F = VZMUL(T3k, VFNMSI(T3E, T3x));
Chris@82 265 T3K = LDW(&(W[TWVL * 34]));
Chris@82 266 T3L = VZMUL(T3K, VFMAI(T3E, T3x));
Chris@82 267 T11 = VADD(TJ, T10);
Chris@82 268 T12 = VFNMS(LDK(KP831469612), T11, Ts);
Chris@82 269 T2e = VFMA(LDK(KP831469612), T11, Ts);
Chris@82 270 T1e = VADD(T1c, T1d);
Chris@82 271 T1f = VFNMS(LDK(KP831469612), T1e, T1b);
Chris@82 272 T2f = VFMA(LDK(KP831469612), T1e, T1b);
Chris@82 273 T1 = LDW(&(W[TWVL * 24]));
Chris@82 274 T1g = VZMULI(T1, VFMAI(T1f, T12));
Chris@82 275 T3W = LDW(&(W[TWVL * 4]));
Chris@82 276 T3X = VZMULI(T3W, VFNMSI(T2f, T2e));
Chris@82 277 T2d = LDW(&(W[TWVL * 56]));
Chris@82 278 T2g = VZMULI(T2d, VFMAI(T2f, T2e));
Chris@82 279 T3I = LDW(&(W[TWVL * 36]));
Chris@82 280 T3J = VZMULI(T3I, VFNMSI(T1f, T12));
Chris@82 281 T2v = VSUB(T2r, T2u);
Chris@82 282 T2w = VFMA(LDK(KP980785280), T2v, T2o);
Chris@82 283 T2Q = VFNMS(LDK(KP980785280), T2v, T2o);
Chris@82 284 T2C = VSUB(T2A, T2B);
Chris@82 285 T2D = VFNMS(LDK(KP980785280), T2C, T2z);
Chris@82 286 T2R = VFMA(LDK(KP980785280), T2C, T2z);
Chris@82 287 T2l = LDW(&(W[TWVL * 48]));
Chris@82 288 T2E = VZMULI(T2l, VFMAI(T2D, T2w));
Chris@82 289 T4k = LDW(&(W[TWVL * 12]));
Chris@82 290 T4l = VZMULI(T4k, VFNMSI(T2D, T2w));
Chris@82 291 T2P = LDW(&(W[TWVL * 16]));
Chris@82 292 T2S = VZMULI(T2P, VFMAI(T2R, T2Q));
Chris@82 293 T4e = LDW(&(W[TWVL * 44]));
Chris@82 294 T4f = VZMULI(T4e, VFNMSI(T2R, T2Q));
Chris@82 295 T1T = VSUB(T1d, T1c);
Chris@82 296 T1U = VFNMS(LDK(KP831469612), T1T, T1S);
Chris@82 297 T24 = VFMA(LDK(KP831469612), T1T, T1S);
Chris@82 298 T1W = VSUB(TJ, T10);
Chris@82 299 T1X = VFNMS(LDK(KP831469612), T1W, T1V);
Chris@82 300 T25 = VFMA(LDK(KP831469612), T1W, T1V);
Chris@82 301 T1R = LDW(&(W[TWVL * 40]));
Chris@82 302 T1Y = VZMULI(T1R, VFMAI(T1X, T1U));
Chris@82 303 T4s = LDW(&(W[TWVL * 52]));
Chris@82 304 T4t = VZMULI(T4s, VFNMSI(T25, T24));
Chris@82 305 T23 = LDW(&(W[TWVL * 8]));
Chris@82 306 T26 = VZMULI(T23, VFMAI(T25, T24));
Chris@82 307 T42 = LDW(&(W[TWVL * 20]));
Chris@82 308 T43 = VZMULI(T42, VFNMSI(T1X, T1U));
Chris@82 309 T2Z = VADD(T2A, T2B);
Chris@82 310 T30 = VFNMS(LDK(KP980785280), T2Z, T2Y);
Chris@82 311 T3c = VFMA(LDK(KP980785280), T2Z, T2Y);
Chris@82 312 T32 = VADD(T2r, T2u);
Chris@82 313 T33 = VFNMS(LDK(KP980785280), T32, T31);
Chris@82 314 T3d = VFMA(LDK(KP980785280), T32, T31);
Chris@82 315 T2X = LDW(&(W[TWVL * 32]));
Chris@82 316 T34 = VZMULI(T2X, VFMAI(T33, T30));
Chris@82 317 T3O = LDW(&(W[TWVL * 60]));
Chris@82 318 T3P = VZMULI(T3O, VFNMSI(T3d, T3c));
Chris@82 319 T3b = LDW(&(W[0]));
Chris@82 320 T3e = VZMULI(T3b, VFMAI(T3d, T3c));
Chris@82 321 T3i = LDW(&(W[TWVL * 28]));
Chris@82 322 T3j = VZMULI(T3i, VFNMSI(T33, T30));
Chris@82 323 }
Chris@82 324 {
Chris@82 325 V T1P, T4w, T2j, T4c, T4x, T1Q, T4d, T2k, T21, T4q, T2b, T4i, T4r, T22, T4j;
Chris@82 326 V T2c, T2N, T40, T3g, T3G, T41, T2O, T3H, T3h, T2V, T3U, T39, T3M, T3V, T2W;
Chris@82 327 V T3N, T3a;
Chris@82 328 T1P = VADD(T1g, T1O);
Chris@82 329 ST(&(Rp[WS(rs, 6)]), T1P, ms, &(Rp[0]));
Chris@82 330 T4w = VADD(T4t, T4v);
Chris@82 331 ST(&(Rp[WS(rs, 13)]), T4w, ms, &(Rp[WS(rs, 1)]));
Chris@82 332 T2j = VADD(T2g, T2i);
Chris@82 333 ST(&(Rp[WS(rs, 14)]), T2j, ms, &(Rp[0]));
Chris@82 334 T4c = VADD(T43, T4b);
Chris@82 335 ST(&(Rp[WS(rs, 5)]), T4c, ms, &(Rp[WS(rs, 1)]));
Chris@82 336 T4x = VCONJ(VSUB(T4v, T4t));
Chris@82 337 ST(&(Rm[WS(rs, 13)]), T4x, -ms, &(Rm[WS(rs, 1)]));
Chris@82 338 T1Q = VCONJ(VSUB(T1O, T1g));
Chris@82 339 ST(&(Rm[WS(rs, 6)]), T1Q, -ms, &(Rm[0]));
Chris@82 340 T4d = VCONJ(VSUB(T4b, T43));
Chris@82 341 ST(&(Rm[WS(rs, 5)]), T4d, -ms, &(Rm[WS(rs, 1)]));
Chris@82 342 T2k = VCONJ(VSUB(T2i, T2g));
Chris@82 343 ST(&(Rm[WS(rs, 14)]), T2k, -ms, &(Rm[0]));
Chris@82 344 T21 = VADD(T1Y, T20);
Chris@82 345 ST(&(Rp[WS(rs, 10)]), T21, ms, &(Rp[0]));
Chris@82 346 T4q = VADD(T4l, T4p);
Chris@82 347 ST(&(Rp[WS(rs, 3)]), T4q, ms, &(Rp[WS(rs, 1)]));
Chris@82 348 T2b = VADD(T26, T2a);
Chris@82 349 ST(&(Rp[WS(rs, 2)]), T2b, ms, &(Rp[0]));
Chris@82 350 T4i = VADD(T4f, T4h);
Chris@82 351 ST(&(Rp[WS(rs, 11)]), T4i, ms, &(Rp[WS(rs, 1)]));
Chris@82 352 T4r = VCONJ(VSUB(T4p, T4l));
Chris@82 353 ST(&(Rm[WS(rs, 3)]), T4r, -ms, &(Rm[WS(rs, 1)]));
Chris@82 354 T22 = VCONJ(VSUB(T20, T1Y));
Chris@82 355 ST(&(Rm[WS(rs, 10)]), T22, -ms, &(Rm[0]));
Chris@82 356 T4j = VCONJ(VSUB(T4h, T4f));
Chris@82 357 ST(&(Rm[WS(rs, 11)]), T4j, -ms, &(Rm[WS(rs, 1)]));
Chris@82 358 T2c = VCONJ(VSUB(T2a, T26));
Chris@82 359 ST(&(Rm[WS(rs, 2)]), T2c, -ms, &(Rm[0]));
Chris@82 360 T2N = VADD(T2E, T2M);
Chris@82 361 ST(&(Rp[WS(rs, 12)]), T2N, ms, &(Rp[0]));
Chris@82 362 T40 = VADD(T3X, T3Z);
Chris@82 363 ST(&(Rp[WS(rs, 1)]), T40, ms, &(Rp[WS(rs, 1)]));
Chris@82 364 T3g = VADD(T3e, T3f);
Chris@82 365 ST(&(Rp[0]), T3g, ms, &(Rp[0]));
Chris@82 366 T3G = VADD(T3j, T3F);
Chris@82 367 ST(&(Rp[WS(rs, 7)]), T3G, ms, &(Rp[WS(rs, 1)]));
Chris@82 368 T41 = VCONJ(VSUB(T3Z, T3X));
Chris@82 369 ST(&(Rm[WS(rs, 1)]), T41, -ms, &(Rm[WS(rs, 1)]));
Chris@82 370 T2O = VCONJ(VSUB(T2M, T2E));
Chris@82 371 ST(&(Rm[WS(rs, 12)]), T2O, -ms, &(Rm[0]));
Chris@82 372 T3H = VCONJ(VSUB(T3F, T3j));
Chris@82 373 ST(&(Rm[WS(rs, 7)]), T3H, -ms, &(Rm[WS(rs, 1)]));
Chris@82 374 T3h = VCONJ(VSUB(T3f, T3e));
Chris@82 375 ST(&(Rm[0]), T3h, -ms, &(Rm[0]));
Chris@82 376 T2V = VADD(T2S, T2U);
Chris@82 377 ST(&(Rp[WS(rs, 4)]), T2V, ms, &(Rp[0]));
Chris@82 378 T3U = VADD(T3P, T3T);
Chris@82 379 ST(&(Rp[WS(rs, 15)]), T3U, ms, &(Rp[WS(rs, 1)]));
Chris@82 380 T39 = VADD(T34, T38);
Chris@82 381 ST(&(Rp[WS(rs, 8)]), T39, ms, &(Rp[0]));
Chris@82 382 T3M = VADD(T3J, T3L);
Chris@82 383 ST(&(Rp[WS(rs, 9)]), T3M, ms, &(Rp[WS(rs, 1)]));
Chris@82 384 T3V = VCONJ(VSUB(T3T, T3P));
Chris@82 385 ST(&(Rm[WS(rs, 15)]), T3V, -ms, &(Rm[WS(rs, 1)]));
Chris@82 386 T2W = VCONJ(VSUB(T2U, T2S));
Chris@82 387 ST(&(Rm[WS(rs, 4)]), T2W, -ms, &(Rm[0]));
Chris@82 388 T3N = VCONJ(VSUB(T3L, T3J));
Chris@82 389 ST(&(Rm[WS(rs, 9)]), T3N, -ms, &(Rm[WS(rs, 1)]));
Chris@82 390 T3a = VCONJ(VSUB(T38, T34));
Chris@82 391 ST(&(Rm[WS(rs, 8)]), T3a, -ms, &(Rm[0]));
Chris@82 392 }
Chris@82 393 }
Chris@82 394 }
Chris@82 395 }
Chris@82 396 VLEAVE();
Chris@82 397 }
Chris@82 398
/* Twiddle-instruction table for the planner: one VTW(1, k) entry per
 * twiddle index k = 1..31 (all 31 non-trivial twiddles of a size-32
 * transform), terminated by {TW_NEXT, VL, 0}.  Generated to match the
 * 62 TWVL units the codelet consumes per iteration. */
Chris@82 399 static const tw_instr twinstr[] = {
Chris@82 400 VTW(1, 1),
Chris@82 401 VTW(1, 2),
Chris@82 402 VTW(1, 3),
Chris@82 403 VTW(1, 4),
Chris@82 404 VTW(1, 5),
Chris@82 405 VTW(1, 6),
Chris@82 406 VTW(1, 7),
Chris@82 407 VTW(1, 8),
Chris@82 408 VTW(1, 9),
Chris@82 409 VTW(1, 10),
Chris@82 410 VTW(1, 11),
Chris@82 411 VTW(1, 12),
Chris@82 412 VTW(1, 13),
Chris@82 413 VTW(1, 14),
Chris@82 414 VTW(1, 15),
Chris@82 415 VTW(1, 16),
Chris@82 416 VTW(1, 17),
Chris@82 417 VTW(1, 18),
Chris@82 418 VTW(1, 19),
Chris@82 419 VTW(1, 20),
Chris@82 420 VTW(1, 21),
Chris@82 421 VTW(1, 22),
Chris@82 422 VTW(1, 23),
Chris@82 423 VTW(1, 24),
Chris@82 424 VTW(1, 25),
Chris@82 425 VTW(1, 26),
Chris@82 426 VTW(1, 27),
Chris@82 427 VTW(1, 28),
Chris@82 428 VTW(1, 29),
Chris@82 429 VTW(1, 30),
Chris@82 430 VTW(1, 31),
Chris@82 431 {TW_NEXT, VL, 0}
Chris@82 432 };
Chris@82 433
/* Codelet descriptor: transform size 32, codelet name, twiddle table, and
 * the operation counts {adds, muls, fmas, other} = {119, 62, 130, 0},
 * matching the generated cost comment at the top of this variant. */
Chris@82 434 static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cbdftv_32"), twinstr, &GENUS, {119, 62, 130, 0} };
Chris@82 435
/* Register this codelet with the FFTW planner so it can be selected at
 * plan time; HC2C_VIA_DFT marks it as an hc2c solver implemented via a
 * complex DFT. */
Chris@82 436 void XSIMD(codelet_hc2cbdftv_32) (planner *p) {
Chris@82 437 X(khc2c_register) (p, hc2cbdftv_32, &desc, HC2C_VIA_DFT);
Chris@82 438 }
Chris@82 439 #else
Chris@82 440
Chris@82 441 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dif -sign 1 -name hc2cbdftv_32 -include rdft/simd/hc2cbv.h */
Chris@82 442
Chris@82 443 /*
Chris@82 444 * This function contains 249 FP additions, 104 FP multiplications,
Chris@82 445 * (or, 233 additions, 88 multiplications, 16 fused multiply/add),
Chris@82 446 * 161 stack variables, 7 constants, and 64 memory accesses
Chris@82 447 */
Chris@82 448 #include "rdft/simd/hc2cbv.h"
Chris@82 449
Chris@82 450 static void hc2cbdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 451 {
Chris@82 452 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 453 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 454 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 455 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 456 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 457 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 458 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 459 {
Chris@82 460 INT m;
Chris@82 461 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
Chris@82 462 V T1W, T21, Tf, T2c, T1t, T2r, T3T, T4m, Ty, T2q, T3P, T4n, T1n, T2d, T1T;
Chris@82 463 V T22, T1E, T24, T3I, T4p, TU, T2n, T1i, T2h, T1L, T25, T3L, T4q, T1f, T2o;
Chris@82 464 V T1j, T2k;
Chris@82 465 {
Chris@82 466 V T2, T4, T1Z, T1p, T1r, T20, T9, T1U, Td, T1V, T3, T1q, T6, T8, T7;
Chris@82 467 V Tc, Tb, Ta, T5, Te, T1o, T1s, T3R, T3S, Tj, T1N, Tw, T1Q, Tn, T1O;
Chris@82 468 V Ts, T1R, Tg, Ti, Th, Tv, Tu, Tt, Tk, Tm, Tl, Tp, Tr, Tq, To;
Chris@82 469 V Tx, T3N, T3O, T1l, T1m, T1P, T1S;
Chris@82 470 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@82 471 T3 = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 472 T4 = VCONJ(T3);
Chris@82 473 T1Z = VADD(T2, T4);
Chris@82 474 T1p = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
Chris@82 475 T1q = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 476 T1r = VCONJ(T1q);
Chris@82 477 T20 = VADD(T1p, T1r);
Chris@82 478 T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@82 479 T7 = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 480 T8 = VCONJ(T7);
Chris@82 481 T9 = VSUB(T6, T8);
Chris@82 482 T1U = VADD(T6, T8);
Chris@82 483 Tc = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
Chris@82 484 Ta = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 485 Tb = VCONJ(Ta);
Chris@82 486 Td = VSUB(Tb, Tc);
Chris@82 487 T1V = VADD(Tb, Tc);
Chris@82 488 T1W = VSUB(T1U, T1V);
Chris@82 489 T21 = VSUB(T1Z, T20);
Chris@82 490 T5 = VSUB(T2, T4);
Chris@82 491 Te = VMUL(LDK(KP707106781), VADD(T9, Td));
Chris@82 492 Tf = VSUB(T5, Te);
Chris@82 493 T2c = VADD(T5, Te);
Chris@82 494 T1o = VMUL(LDK(KP707106781), VSUB(T9, Td));
Chris@82 495 T1s = VSUB(T1p, T1r);
Chris@82 496 T1t = VSUB(T1o, T1s);
Chris@82 497 T2r = VADD(T1s, T1o);
Chris@82 498 T3R = VADD(T1Z, T20);
Chris@82 499 T3S = VADD(T1U, T1V);
Chris@82 500 T3T = VSUB(T3R, T3S);
Chris@82 501 T4m = VADD(T3R, T3S);
Chris@82 502 Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@82 503 Th = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 504 Ti = VCONJ(Th);
Chris@82 505 Tj = VSUB(Tg, Ti);
Chris@82 506 T1N = VADD(Tg, Ti);
Chris@82 507 Tv = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
Chris@82 508 Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 509 Tu = VCONJ(Tt);
Chris@82 510 Tw = VSUB(Tu, Tv);
Chris@82 511 T1Q = VADD(Tu, Tv);
Chris@82 512 Tk = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
Chris@82 513 Tl = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 514 Tm = VCONJ(Tl);
Chris@82 515 Tn = VSUB(Tk, Tm);
Chris@82 516 T1O = VADD(Tk, Tm);
Chris@82 517 Tp = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
Chris@82 518 Tq = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 519 Tr = VCONJ(Tq);
Chris@82 520 Ts = VSUB(Tp, Tr);
Chris@82 521 T1R = VADD(Tp, Tr);
Chris@82 522 To = VFMA(LDK(KP382683432), Tj, VMUL(LDK(KP923879532), Tn));
Chris@82 523 Tx = VFNMS(LDK(KP382683432), Tw, VMUL(LDK(KP923879532), Ts));
Chris@82 524 Ty = VSUB(To, Tx);
Chris@82 525 T2q = VADD(To, Tx);
Chris@82 526 T3N = VADD(T1N, T1O);
Chris@82 527 T3O = VADD(T1Q, T1R);
Chris@82 528 T3P = VSUB(T3N, T3O);
Chris@82 529 T4n = VADD(T3N, T3O);
Chris@82 530 T1l = VFNMS(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tj));
Chris@82 531 T1m = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), Ts));
Chris@82 532 T1n = VSUB(T1l, T1m);
Chris@82 533 T2d = VADD(T1l, T1m);
Chris@82 534 T1P = VSUB(T1N, T1O);
Chris@82 535 T1S = VSUB(T1Q, T1R);
Chris@82 536 T1T = VMUL(LDK(KP707106781), VSUB(T1P, T1S));
Chris@82 537 T22 = VMUL(LDK(KP707106781), VADD(T1P, T1S));
Chris@82 538 }
Chris@82 539 {
Chris@82 540 V TD, T1B, TR, T1y, TH, T1C, TM, T1z, TA, TC, TB, TO, TQ, TP, TG;
Chris@82 541 V TF, TE, TJ, TL, TK, T1A, T1D, T3G, T3H, TN, T2f, TT, T2g, TI, TS;
Chris@82 542 V TY, T1I, T1c, T1F, T12, T1J, T17, T1G, TV, TX, TW, T1b, T1a, T19, T11;
Chris@82 543 V T10, TZ, T14, T16, T15, T1H, T1K, T3J, T3K, T18, T2i, T1e, T2j, T13, T1d;
Chris@82 544 TA = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 545 TB = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
Chris@82 546 TC = VCONJ(TB);
Chris@82 547 TD = VSUB(TA, TC);
Chris@82 548 T1B = VADD(TA, TC);
Chris@82 549 TO = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 550 TP = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
Chris@82 551 TQ = VCONJ(TP);
Chris@82 552 TR = VSUB(TO, TQ);
Chris@82 553 T1y = VADD(TO, TQ);
Chris@82 554 TG = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 555 TE = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@82 556 TF = VCONJ(TE);
Chris@82 557 TH = VSUB(TF, TG);
Chris@82 558 T1C = VADD(TF, TG);
Chris@82 559 TJ = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 560 TK = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
Chris@82 561 TL = VCONJ(TK);
Chris@82 562 TM = VSUB(TJ, TL);
Chris@82 563 T1z = VADD(TJ, TL);
Chris@82 564 T1A = VSUB(T1y, T1z);
Chris@82 565 T1D = VSUB(T1B, T1C);
Chris@82 566 T1E = VFNMS(LDK(KP382683432), T1D, VMUL(LDK(KP923879532), T1A));
Chris@82 567 T24 = VFMA(LDK(KP382683432), T1A, VMUL(LDK(KP923879532), T1D));
Chris@82 568 T3G = VADD(T1y, T1z);
Chris@82 569 T3H = VADD(T1B, T1C);
Chris@82 570 T3I = VSUB(T3G, T3H);
Chris@82 571 T4p = VADD(T3G, T3H);
Chris@82 572 TI = VMUL(LDK(KP707106781), VSUB(TD, TH));
Chris@82 573 TN = VSUB(TI, TM);
Chris@82 574 T2f = VADD(TM, TI);
Chris@82 575 TS = VMUL(LDK(KP707106781), VADD(TD, TH));
Chris@82 576 TT = VSUB(TR, TS);
Chris@82 577 T2g = VADD(TR, TS);
Chris@82 578 TU = VFMA(LDK(KP831469612), TN, VMUL(LDK(KP555570233), TT));
Chris@82 579 T2n = VFNMS(LDK(KP195090322), T2f, VMUL(LDK(KP980785280), T2g));
Chris@82 580 T1i = VFNMS(LDK(KP555570233), TN, VMUL(LDK(KP831469612), TT));
Chris@82 581 T2h = VFMA(LDK(KP980785280), T2f, VMUL(LDK(KP195090322), T2g));
Chris@82 582 TV = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 583 TW = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
Chris@82 584 TX = VCONJ(TW);
Chris@82 585 TY = VSUB(TV, TX);
Chris@82 586 T1I = VADD(TV, TX);
Chris@82 587 T1b = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 588 T19 = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@82 589 T1a = VCONJ(T19);
Chris@82 590 T1c = VSUB(T1a, T1b);
Chris@82 591 T1F = VADD(T1a, T1b);
Chris@82 592 T11 = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 593 TZ = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@82 594 T10 = VCONJ(TZ);
Chris@82 595 T12 = VSUB(T10, T11);
Chris@82 596 T1J = VADD(T10, T11);
Chris@82 597 T14 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 598 T15 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
Chris@82 599 T16 = VCONJ(T15);
Chris@82 600 T17 = VSUB(T14, T16);
Chris@82 601 T1G = VADD(T14, T16);
Chris@82 602 T1H = VSUB(T1F, T1G);
Chris@82 603 T1K = VSUB(T1I, T1J);
Chris@82 604 T1L = VFMA(LDK(KP923879532), T1H, VMUL(LDK(KP382683432), T1K));
Chris@82 605 T25 = VFNMS(LDK(KP382683432), T1H, VMUL(LDK(KP923879532), T1K));
Chris@82 606 T3J = VADD(T1F, T1G);
Chris@82 607 T3K = VADD(T1I, T1J);
Chris@82 608 T3L = VSUB(T3J, T3K);
Chris@82 609 T4q = VADD(T3J, T3K);
Chris@82 610 T13 = VMUL(LDK(KP707106781), VSUB(TY, T12));
Chris@82 611 T18 = VSUB(T13, T17);
Chris@82 612 T2i = VADD(T17, T13);
Chris@82 613 T1d = VMUL(LDK(KP707106781), VADD(TY, T12));
Chris@82 614 T1e = VSUB(T1c, T1d);
Chris@82 615 T2j = VADD(T1c, T1d);
Chris@82 616 T1f = VFNMS(LDK(KP555570233), T1e, VMUL(LDK(KP831469612), T18));
Chris@82 617 T2o = VFMA(LDK(KP195090322), T2i, VMUL(LDK(KP980785280), T2j));
Chris@82 618 T1j = VFMA(LDK(KP555570233), T18, VMUL(LDK(KP831469612), T1e));
Chris@82 619 T2k = VFNMS(LDK(KP195090322), T2j, VMUL(LDK(KP980785280), T2i));
Chris@82 620 }
Chris@82 621 {
Chris@82 622 V T4L, T4G, T4s, T4y, T3W, T4g, T42, T4a, T3g, T4e, T3o, T3E, T1w, T46, T2M;
Chris@82 623 V T40, T2u, T4w, T2C, T4k, T36, T3A, T3i, T3s, T28, T2O, T2w, T2G, T2Y, T4K;
Chris@82 624 V T3y, T4C;
Chris@82 625 {
Chris@82 626 V T4E, T4F, T4D, T4o, T4r, T4l, T4x, T3Q, T48, T3V, T49, T3M, T3U, T3F, T4f;
Chris@82 627 V T41, T47, T3c, T3n, T3f, T3m, T3a, T3b, T3d, T3e, T39, T4d, T3l, T3D, T1h;
Chris@82 628 V T2K, T1v, T2L, Tz, T1g, T1k, T1u, T1, T45, T2J, T3Z, T2m, T2A, T2t, T2B;
Chris@82 629 V T2e, T2l, T2p, T2s, T2b, T4v, T2z, T4j;
Chris@82 630 T4E = VADD(T4m, T4n);
Chris@82 631 T4F = VADD(T4p, T4q);
Chris@82 632 T4L = VADD(T4E, T4F);
Chris@82 633 T4D = LDW(&(W[TWVL * 30]));
Chris@82 634 T4G = VZMUL(T4D, VSUB(T4E, T4F));
Chris@82 635 T4o = VSUB(T4m, T4n);
Chris@82 636 T4r = VBYI(VSUB(T4p, T4q));
Chris@82 637 T4l = LDW(&(W[TWVL * 46]));
Chris@82 638 T4s = VZMUL(T4l, VSUB(T4o, T4r));
Chris@82 639 T4x = LDW(&(W[TWVL * 14]));
Chris@82 640 T4y = VZMUL(T4x, VADD(T4o, T4r));
Chris@82 641 T3M = VMUL(LDK(KP707106781), VSUB(T3I, T3L));
Chris@82 642 T3Q = VBYI(VSUB(T3M, T3P));
Chris@82 643 T48 = VBYI(VADD(T3P, T3M));
Chris@82 644 T3U = VMUL(LDK(KP707106781), VADD(T3I, T3L));
Chris@82 645 T3V = VSUB(T3T, T3U);
Chris@82 646 T49 = VADD(T3T, T3U);
Chris@82 647 T3F = LDW(&(W[TWVL * 22]));
Chris@82 648 T3W = VZMUL(T3F, VADD(T3Q, T3V));
Chris@82 649 T4f = LDW(&(W[TWVL * 54]));
Chris@82 650 T4g = VZMUL(T4f, VSUB(T49, T48));
Chris@82 651 T41 = LDW(&(W[TWVL * 38]));
Chris@82 652 T42 = VZMUL(T41, VSUB(T3V, T3Q));
Chris@82 653 T47 = LDW(&(W[TWVL * 6]));
Chris@82 654 T4a = VZMUL(T47, VADD(T48, T49));
Chris@82 655 T3a = VADD(T1t, T1n);
Chris@82 656 T3b = VADD(TU, T1f);
Chris@82 657 T3c = VBYI(VADD(T3a, T3b));
Chris@82 658 T3n = VBYI(VSUB(T3b, T3a));
Chris@82 659 T3d = VADD(Tf, Ty);
Chris@82 660 T3e = VADD(T1i, T1j);
Chris@82 661 T3f = VADD(T3d, T3e);
Chris@82 662 T3m = VSUB(T3d, T3e);
Chris@82 663 T39 = LDW(&(W[TWVL * 4]));
Chris@82 664 T3g = VZMULI(T39, VADD(T3c, T3f));
Chris@82 665 T4d = LDW(&(W[TWVL * 56]));
Chris@82 666 T4e = VZMULI(T4d, VSUB(T3f, T3c));
Chris@82 667 T3l = LDW(&(W[TWVL * 36]));
Chris@82 668 T3o = VZMULI(T3l, VSUB(T3m, T3n));
Chris@82 669 T3D = LDW(&(W[TWVL * 24]));
Chris@82 670 T3E = VZMULI(T3D, VADD(T3n, T3m));
Chris@82 671 Tz = VSUB(Tf, Ty);
Chris@82 672 T1g = VSUB(TU, T1f);
Chris@82 673 T1h = VSUB(Tz, T1g);
Chris@82 674 T2K = VADD(Tz, T1g);
Chris@82 675 T1k = VSUB(T1i, T1j);
Chris@82 676 T1u = VSUB(T1n, T1t);
Chris@82 677 T1v = VBYI(VSUB(T1k, T1u));
Chris@82 678 T2L = VBYI(VADD(T1u, T1k));
Chris@82 679 T1 = LDW(&(W[TWVL * 20]));
Chris@82 680 T1w = VZMULI(T1, VADD(T1h, T1v));
Chris@82 681 T45 = LDW(&(W[TWVL * 8]));
Chris@82 682 T46 = VZMULI(T45, VADD(T2K, T2L));
Chris@82 683 T2J = LDW(&(W[TWVL * 52]));
Chris@82 684 T2M = VZMULI(T2J, VSUB(T2K, T2L));
Chris@82 685 T3Z = LDW(&(W[TWVL * 40]));
Chris@82 686 T40 = VZMULI(T3Z, VSUB(T1h, T1v));
Chris@82 687 T2e = VSUB(T2c, T2d);
Chris@82 688 T2l = VSUB(T2h, T2k);
Chris@82 689 T2m = VSUB(T2e, T2l);
Chris@82 690 T2A = VADD(T2e, T2l);
Chris@82 691 T2p = VSUB(T2n, T2o);
Chris@82 692 T2s = VSUB(T2q, T2r);
Chris@82 693 T2t = VBYI(VSUB(T2p, T2s));
Chris@82 694 T2B = VBYI(VADD(T2s, T2p));
Chris@82 695 T2b = LDW(&(W[TWVL * 44]));
Chris@82 696 T2u = VZMULI(T2b, VSUB(T2m, T2t));
Chris@82 697 T4v = LDW(&(W[TWVL * 16]));
Chris@82 698 T4w = VZMULI(T4v, VADD(T2m, T2t));
Chris@82 699 T2z = LDW(&(W[TWVL * 12]));
Chris@82 700 T2C = VZMULI(T2z, VADD(T2A, T2B));
Chris@82 701 T4j = LDW(&(W[TWVL * 48]));
Chris@82 702 T4k = VZMULI(T4j, VSUB(T2A, T2B));
Chris@82 703 {
Chris@82 704 V T32, T3q, T35, T3r, T30, T31, T33, T34, T2Z, T3z, T3h, T3p, T1Y, T2E, T27;
Chris@82 705 V T2F, T1M, T1X, T23, T26, T1x, T2N, T2v, T2D, T2U, T3x, T2X, T3w, T2S, T2T;
Chris@82 706 V T2V, T2W, T2R, T4J, T3v, T4B;
Chris@82 707 T30 = VADD(T21, T22);
Chris@82 708 T31 = VADD(T1E, T1L);
Chris@82 709 T32 = VADD(T30, T31);
Chris@82 710 T3q = VSUB(T30, T31);
Chris@82 711 T33 = VADD(T1W, T1T);
Chris@82 712 T34 = VADD(T24, T25);
Chris@82 713 T35 = VBYI(VADD(T33, T34));
Chris@82 714 T3r = VBYI(VSUB(T34, T33));
Chris@82 715 T2Z = LDW(&(W[TWVL * 58]));
Chris@82 716 T36 = VZMUL(T2Z, VSUB(T32, T35));
Chris@82 717 T3z = LDW(&(W[TWVL * 26]));
Chris@82 718 T3A = VZMUL(T3z, VADD(T3q, T3r));
Chris@82 719 T3h = LDW(&(W[TWVL * 2]));
Chris@82 720 T3i = VZMUL(T3h, VADD(T32, T35));
Chris@82 721 T3p = LDW(&(W[TWVL * 34]));
Chris@82 722 T3s = VZMUL(T3p, VSUB(T3q, T3r));
Chris@82 723 T1M = VSUB(T1E, T1L);
Chris@82 724 T1X = VSUB(T1T, T1W);
Chris@82 725 T1Y = VBYI(VSUB(T1M, T1X));
Chris@82 726 T2E = VBYI(VADD(T1X, T1M));
Chris@82 727 T23 = VSUB(T21, T22);
Chris@82 728 T26 = VSUB(T24, T25);
Chris@82 729 T27 = VSUB(T23, T26);
Chris@82 730 T2F = VADD(T23, T26);
Chris@82 731 T1x = LDW(&(W[TWVL * 18]));
Chris@82 732 T28 = VZMUL(T1x, VADD(T1Y, T27));
Chris@82 733 T2N = LDW(&(W[TWVL * 50]));
Chris@82 734 T2O = VZMUL(T2N, VSUB(T2F, T2E));
Chris@82 735 T2v = LDW(&(W[TWVL * 42]));
Chris@82 736 T2w = VZMUL(T2v, VSUB(T27, T1Y));
Chris@82 737 T2D = LDW(&(W[TWVL * 10]));
Chris@82 738 T2G = VZMUL(T2D, VADD(T2E, T2F));
Chris@82 739 T2S = VADD(T2c, T2d);
Chris@82 740 T2T = VADD(T2n, T2o);
Chris@82 741 T2U = VADD(T2S, T2T);
Chris@82 742 T3x = VSUB(T2S, T2T);
Chris@82 743 T2V = VADD(T2r, T2q);
Chris@82 744 T2W = VADD(T2h, T2k);
Chris@82 745 T2X = VBYI(VADD(T2V, T2W));
Chris@82 746 T3w = VBYI(VSUB(T2W, T2V));
Chris@82 747 T2R = LDW(&(W[TWVL * 60]));
Chris@82 748 T2Y = VZMULI(T2R, VSUB(T2U, T2X));
Chris@82 749 T4J = LDW(&(W[0]));
Chris@82 750 T4K = VZMULI(T4J, VADD(T2X, T2U));
Chris@82 751 T3v = LDW(&(W[TWVL * 28]));
Chris@82 752 T3y = VZMULI(T3v, VADD(T3w, T3x));
Chris@82 753 T4B = LDW(&(W[TWVL * 32]));
Chris@82 754 T4C = VZMULI(T4B, VSUB(T3x, T3w));
Chris@82 755 }
Chris@82 756 }
Chris@82 757 {
Chris@82 758 V T29, T4M, T2P, T4t, T4N, T2a, T4u, T2Q, T2x, T4H, T2H, T4z, T4I, T2y, T4A;
Chris@82 759 V T2I, T37, T4h, T3B, T3X, T4i, T38, T3Y, T3C, T3j, T4b, T3t, T43, T4c, T3k;
Chris@82 760 V T44, T3u;
Chris@82 761 T29 = VADD(T1w, T28);
Chris@82 762 ST(&(Rp[WS(rs, 5)]), T29, ms, &(Rp[WS(rs, 1)]));
Chris@82 763 T4M = VADD(T4K, T4L);
Chris@82 764 ST(&(Rp[0]), T4M, ms, &(Rp[0]));
Chris@82 765 T2P = VADD(T2M, T2O);
Chris@82 766 ST(&(Rp[WS(rs, 13)]), T2P, ms, &(Rp[WS(rs, 1)]));
Chris@82 767 T4t = VADD(T4k, T4s);
Chris@82 768 ST(&(Rp[WS(rs, 12)]), T4t, ms, &(Rp[0]));
Chris@82 769 T4N = VCONJ(VSUB(T4L, T4K));
Chris@82 770 ST(&(Rm[0]), T4N, -ms, &(Rm[0]));
Chris@82 771 T2a = VCONJ(VSUB(T28, T1w));
Chris@82 772 ST(&(Rm[WS(rs, 5)]), T2a, -ms, &(Rm[WS(rs, 1)]));
Chris@82 773 T4u = VCONJ(VSUB(T4s, T4k));
Chris@82 774 ST(&(Rm[WS(rs, 12)]), T4u, -ms, &(Rm[0]));
Chris@82 775 T2Q = VCONJ(VSUB(T2O, T2M));
Chris@82 776 ST(&(Rm[WS(rs, 13)]), T2Q, -ms, &(Rm[WS(rs, 1)]));
Chris@82 777 T2x = VADD(T2u, T2w);
Chris@82 778 ST(&(Rp[WS(rs, 11)]), T2x, ms, &(Rp[WS(rs, 1)]));
Chris@82 779 T4H = VADD(T4C, T4G);
Chris@82 780 ST(&(Rp[WS(rs, 8)]), T4H, ms, &(Rp[0]));
Chris@82 781 T2H = VADD(T2C, T2G);
Chris@82 782 ST(&(Rp[WS(rs, 3)]), T2H, ms, &(Rp[WS(rs, 1)]));
Chris@82 783 T4z = VADD(T4w, T4y);
Chris@82 784 ST(&(Rp[WS(rs, 4)]), T4z, ms, &(Rp[0]));
Chris@82 785 T4I = VCONJ(VSUB(T4G, T4C));
Chris@82 786 ST(&(Rm[WS(rs, 8)]), T4I, -ms, &(Rm[0]));
Chris@82 787 T2y = VCONJ(VSUB(T2w, T2u));
Chris@82 788 ST(&(Rm[WS(rs, 11)]), T2y, -ms, &(Rm[WS(rs, 1)]));
Chris@82 789 T4A = VCONJ(VSUB(T4y, T4w));
Chris@82 790 ST(&(Rm[WS(rs, 4)]), T4A, -ms, &(Rm[0]));
Chris@82 791 T2I = VCONJ(VSUB(T2G, T2C));
Chris@82 792 ST(&(Rm[WS(rs, 3)]), T2I, -ms, &(Rm[WS(rs, 1)]));
Chris@82 793 T37 = VADD(T2Y, T36);
Chris@82 794 ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
Chris@82 795 T4h = VADD(T4e, T4g);
Chris@82 796 ST(&(Rp[WS(rs, 14)]), T4h, ms, &(Rp[0]));
Chris@82 797 T3B = VADD(T3y, T3A);
Chris@82 798 ST(&(Rp[WS(rs, 7)]), T3B, ms, &(Rp[WS(rs, 1)]));
Chris@82 799 T3X = VADD(T3E, T3W);
Chris@82 800 ST(&(Rp[WS(rs, 6)]), T3X, ms, &(Rp[0]));
Chris@82 801 T4i = VCONJ(VSUB(T4g, T4e));
Chris@82 802 ST(&(Rm[WS(rs, 14)]), T4i, -ms, &(Rm[0]));
Chris@82 803 T38 = VCONJ(VSUB(T36, T2Y));
Chris@82 804 ST(&(Rm[WS(rs, 15)]), T38, -ms, &(Rm[WS(rs, 1)]));
Chris@82 805 T3Y = VCONJ(VSUB(T3W, T3E));
Chris@82 806 ST(&(Rm[WS(rs, 6)]), T3Y, -ms, &(Rm[0]));
Chris@82 807 T3C = VCONJ(VSUB(T3A, T3y));
Chris@82 808 ST(&(Rm[WS(rs, 7)]), T3C, -ms, &(Rm[WS(rs, 1)]));
Chris@82 809 T3j = VADD(T3g, T3i);
Chris@82 810 ST(&(Rp[WS(rs, 1)]), T3j, ms, &(Rp[WS(rs, 1)]));
Chris@82 811 T4b = VADD(T46, T4a);
Chris@82 812 ST(&(Rp[WS(rs, 2)]), T4b, ms, &(Rp[0]));
Chris@82 813 T3t = VADD(T3o, T3s);
Chris@82 814 ST(&(Rp[WS(rs, 9)]), T3t, ms, &(Rp[WS(rs, 1)]));
Chris@82 815 T43 = VADD(T40, T42);
Chris@82 816 ST(&(Rp[WS(rs, 10)]), T43, ms, &(Rp[0]));
Chris@82 817 T4c = VCONJ(VSUB(T4a, T46));
Chris@82 818 ST(&(Rm[WS(rs, 2)]), T4c, -ms, &(Rm[0]));
Chris@82 819 T3k = VCONJ(VSUB(T3i, T3g));
Chris@82 820 ST(&(Rm[WS(rs, 1)]), T3k, -ms, &(Rm[WS(rs, 1)]));
Chris@82 821 T44 = VCONJ(VSUB(T42, T40));
Chris@82 822 ST(&(Rm[WS(rs, 10)]), T44, -ms, &(Rm[0]));
Chris@82 823 T3u = VCONJ(VSUB(T3s, T3o));
Chris@82 824 ST(&(Rm[WS(rs, 9)]), T3u, -ms, &(Rm[WS(rs, 1)]));
Chris@82 825 }
Chris@82 826 }
Chris@82 827 }
Chris@82 828 }
Chris@82 829 VLEAVE();
Chris@82 830 }
Chris@82 831
Chris@82 832 /* Twiddle-factor schedule for this size-32 codelet: one VTW(1, k) entry
Chris@82 832  * per twiddle index k = 1..31, terminated by a TW_NEXT record that
Chris@82 832  * advances by the vector length VL.  Consumed by the planner when it
Chris@82 832  * precomputes the twiddle table passed to the codelet as W[]. */
Chris@82 832 static const tw_instr twinstr[] = {
Chris@82 833 VTW(1, 1),
Chris@82 834 VTW(1, 2),
Chris@82 835 VTW(1, 3),
Chris@82 836 VTW(1, 4),
Chris@82 837 VTW(1, 5),
Chris@82 838 VTW(1, 6),
Chris@82 839 VTW(1, 7),
Chris@82 840 VTW(1, 8),
Chris@82 841 VTW(1, 9),
Chris@82 842 VTW(1, 10),
Chris@82 843 VTW(1, 11),
Chris@82 844 VTW(1, 12),
Chris@82 845 VTW(1, 13),
Chris@82 846 VTW(1, 14),
Chris@82 847 VTW(1, 15),
Chris@82 848 VTW(1, 16),
Chris@82 849 VTW(1, 17),
Chris@82 850 VTW(1, 18),
Chris@82 851 VTW(1, 19),
Chris@82 852 VTW(1, 20),
Chris@82 853 VTW(1, 21),
Chris@82 854 VTW(1, 22),
Chris@82 855 VTW(1, 23),
Chris@82 856 VTW(1, 24),
Chris@82 857 VTW(1, 25),
Chris@82 858 VTW(1, 26),
Chris@82 859 VTW(1, 27),
Chris@82 860 VTW(1, 28),
Chris@82 861 VTW(1, 29),
Chris@82 862 VTW(1, 30),
Chris@82 863 VTW(1, 31),
Chris@82 864 {TW_NEXT, VL, 0}
Chris@82 865 };
Chris@82 866
Chris@82 867 /* Codelet descriptor: transform size 32, codelet name, twiddle schedule,
Chris@82 867  * genus, and the trailing opcount record {233, 88, 16, 0} — presumably
Chris@82 867  * the generated add/mul/fma/other operation counts for the FMA variant
Chris@82 867  * (cf. the "249 FP additions, 192 FP multiplications" header comment);
Chris@82 867  * NOTE(review): confirm field order against the opcnt definition. */
Chris@82 867 static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cbdftv_32"), twinstr, &GENUS, {233, 88, 16, 0} };
Chris@82 868
Chris@82 869 /* Registration entry point: installs this codelet with the planner,
Chris@82 869  * solving the hc2c problem via the DFT route (HC2C_VIA_DFT). */
Chris@82 869 void XSIMD(codelet_hc2cbdftv_32) (planner *p) {
Chris@82 870 X(khc2c_register) (p, hc2cbdftv_32, &desc, HC2C_VIA_DFT);
Chris@82 871 }
Chris@82 872 #endif