annotate src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes).

author   Chris Cannam
date     Fri, 07 Feb 2020 11:51:13 +0000
parents  37bf6b4a2645
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sun Nov 25 07:42:30 EST 2012 */

#include "codelet-rdft.h"

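/* Two machine-generated variants of the same size-32 codelet follow: the
   first is scheduled for machines with fused multiply-add instructions
   (the VFMA/VFNMS/VFMACONJ macros) and is compiled when HAVE_FMA is
   defined; the second is the fallback built from separate multiplies
   and adds. */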
#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dif -sign 1 -name hc2cbdftv_32 -include hc2cbv.h */

/*
 * This function contains 249 FP additions, 192 FP multiplications,
 * (or, 119 additions, 62 multiplications, 130 fused multiply/add),
 * 166 stack variables, 7 constants, and 64 memory accesses
 */
#include "hc2cbv.h"

static void hc2cbdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
    DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
    DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
    DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
    DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
    DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
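    /* The constants above are the trigonometric values a size-32 DIF split
       needs: KP980785280 = cos(pi/16), KP831469612 = cos(3*pi/16),
       KP668178637 = tan(3*pi/16), KP198912367 = tan(pi/16),
       KP923879532 = cos(pi/8), KP414213562 = tan(pi/8), and
       KP707106781 = cos(pi/4) = 1/sqrt(2). */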
    {
        INT m;
        for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
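            /* Each iteration handles VL transforms at once: Rp/Ip advance
               forward while Rm/Im retreat by the same amount (the two
               mirrored halves of the half-complex array), and W steps past
               the 62 twiddle values (31 complex factors) consumed per
               transform. */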
            V T3a, T3N;
            {
                V T2G, T1o, T2o, T2Y, T1b, T1V, Ts, T1S, T3A, T48, T3p, T45, T31, T2z, T2H;
                V T1L, Tv, TG, TM, T3q, T1r, TX, TN, T1s, Ty, T1t, TB, TO, TQ, T1y;
                V T3t, TR, T1H, T1K, TV, T1p, T1q, T1w, TW, Tt, Tu, TE, TF, TK, TL;
                V Tw, Tx, Tz, TA, T1x;
                {
                    V T1i, T4, T1j, T15, T1l, T1m, Tb, T16, Tf, T1G, Ti, T1F, Tm, T1J, T1I;
                    V Tp, T2, T3, T13, T14, T5, T6, T8, T9, Td, T7, Ta, Te, Tg, Th;
                    V Tk, Tl, Tn, To, T2m, Tc, T3l, T1k, T3m, T18, Tj, T3y, T1n, Tq, T19;
                    V T3n, T17, T2x, T1a, T2n, T2y, Tr, T3z, T3o;
                    T2 = LD(&(Rp[0]), ms, &(Rp[0]));
                    T3 = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
                    T13 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
                    T14 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
                    T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
                    T6 = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
                    T8 = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
                    T9 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
                    Td = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
                    T1i = VFMACONJ(T3, T2);
                    T4 = VFNMSCONJ(T3, T2);
                    T1j = VFMACONJ(T14, T13);
                    T15 = VFNMSCONJ(T14, T13);
                    T1l = VFMACONJ(T6, T5);
                    T7 = VFNMSCONJ(T6, T5);
                    T1m = VFMACONJ(T9, T8);
                    Ta = VFMSCONJ(T9, T8);
                    Te = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
                    Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
                    Th = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
                    Tk = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
                    Tl = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
                    Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
                    To = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
                    Tb = VADD(T7, Ta);
                    T16 = VSUB(T7, Ta);
                    Tf = VFNMSCONJ(Te, Td);
                    T1G = VFMACONJ(Te, Td);
                    Ti = VFNMSCONJ(Th, Tg);
                    T1F = VFMACONJ(Th, Tg);
                    Tm = VFNMSCONJ(Tl, Tk);
                    T1J = VFMACONJ(Tl, Tk);
                    T1I = VFMACONJ(To, Tn);
                    Tp = VFMSCONJ(To, Tn);
                    T2m = VFMA(LDK(KP707106781), Tb, T4);
                    Tc = VFNMS(LDK(KP707106781), Tb, T4);
                    T3l = VSUB(T1i, T1j);
                    T1k = VADD(T1i, T1j);
                    T1H = VADD(T1F, T1G);
                    T3m = VSUB(T1F, T1G);
                    T18 = VFNMS(LDK(KP414213562), Tf, Ti);
                    Tj = VFMA(LDK(KP414213562), Ti, Tf);
                    T3y = VSUB(T1l, T1m);
                    T1n = VADD(T1l, T1m);
                    Tq = VFNMS(LDK(KP414213562), Tp, Tm);
                    T19 = VFMA(LDK(KP414213562), Tm, Tp);
                    T1K = VADD(T1I, T1J);
                    T3n = VSUB(T1I, T1J);
                    T17 = VFNMS(LDK(KP707106781), T16, T15);
                    T2x = VFMA(LDK(KP707106781), T16, T15);
                    T1a = VSUB(T18, T19);
                    T2n = VADD(T18, T19);
                    T2y = VADD(Tj, Tq);
                    Tr = VSUB(Tj, Tq);
                    T3z = VSUB(T3m, T3n);
                    T3o = VADD(T3m, T3n);
                    T2G = VADD(T1k, T1n);
                    T1o = VSUB(T1k, T1n);
                    T2o = VFNMS(LDK(KP923879532), T2n, T2m);
                    T2Y = VFMA(LDK(KP923879532), T2n, T2m);
                    T1b = VFNMS(LDK(KP923879532), T1a, T17);
                    T1V = VFMA(LDK(KP923879532), T1a, T17);
                    Ts = VFMA(LDK(KP923879532), Tr, Tc);
                    T1S = VFNMS(LDK(KP923879532), Tr, Tc);
                    T3A = VFMA(LDK(KP707106781), T3z, T3y);
                    T48 = VFNMS(LDK(KP707106781), T3z, T3y);
                    T3p = VFMA(LDK(KP707106781), T3o, T3l);
                    T45 = VFNMS(LDK(KP707106781), T3o, T3l);
                    T31 = VFMA(LDK(KP923879532), T2y, T2x);
                    T2z = VFNMS(LDK(KP923879532), T2y, T2x);
                }
                Tt = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
                Tu = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
                TE = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
                TF = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
                TK = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
                TL = LD(&(Rm[0]), -ms, &(Rm[0]));
                TV = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
                T2H = VADD(T1H, T1K);
                T1L = VSUB(T1H, T1K);
                Tv = VFNMSCONJ(Tu, Tt);
                T1p = VFMACONJ(Tu, Tt);
                TG = VFNMSCONJ(TF, TE);
                T1q = VFMACONJ(TF, TE);
                T1w = VFMACONJ(TL, TK);
                TM = VFMSCONJ(TL, TK);
                TW = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
                Tw = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
                Tx = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
                Tz = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
                TA = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
                T3q = VSUB(T1p, T1q);
                T1r = VADD(T1p, T1q);
                T1x = VFMACONJ(TW, TV);
                TX = VFNMSCONJ(TW, TV);
                TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
                T1s = VFMACONJ(Tx, Tw);
                Ty = VFNMSCONJ(Tx, Tw);
                T1t = VFMACONJ(TA, Tz);
                TB = VFMSCONJ(TA, Tz);
                TO = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
                TQ = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
                T1y = VADD(T1w, T1x);
                T3t = VSUB(T1w, T1x);
                TR = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
                {
                    V T38, T3f, T4p, T4v, T3T, T3Z, T2a, T2i, T4b, T4h, T1O, T20, T2M, T2U, T3F;
                    V T3L, T2g, T3X, T3J, T1g, T4f, T2S, T4l, T2E, T2X, T3O, T3b, T3i, T26, T4t;
                    V T43, T1Y, T3c, T30, T3d, T33;
                    {
                        V T2I, T2A, T2r, T1c, TJ, T2L, T2u, T2B, T10, T1d, T3x, T3E, T1E, T1N, T1h;
                        V T1Z, T4m, T1M, T1D, T4a, T4o, T4n, T47, T4u, T3R, T3S, T3Q, T3Y, T28, T29;
                        V T27, T2h, T44, T4g;
                        {
                            V T36, T1v, T2J, T3s, T3B, T2p, TI, T2q, TD, T1B, T3u, TY, TT, T35, T1u;
                            V T3r, TH, TC, T1z, TP, T1A, TS, T3w, T3D, T1C, T2K, T3v, T3C, T2s, TZ;
                            V T2t, TU, T37, T49, T46;
                            T2I = VSUB(T2G, T2H);
                            T36 = VADD(T2G, T2H);
                            T1u = VADD(T1s, T1t);
                            T3r = VSUB(T1s, T1t);
                            TH = VSUB(Ty, TB);
                            TC = VADD(Ty, TB);
                            T1z = VFMACONJ(TO, TN);
                            TP = VFNMSCONJ(TO, TN);
                            T1A = VFMACONJ(TR, TQ);
                            TS = VFMSCONJ(TR, TQ);
                            T1v = VSUB(T1r, T1u);
                            T2J = VADD(T1r, T1u);
                            T3s = VFNMS(LDK(KP414213562), T3r, T3q);
                            T3B = VFMA(LDK(KP414213562), T3q, T3r);
                            T2p = VFMA(LDK(KP707106781), TH, TG);
                            TI = VFNMS(LDK(KP707106781), TH, TG);
                            T2q = VFMA(LDK(KP707106781), TC, Tv);
                            TD = VFNMS(LDK(KP707106781), TC, Tv);
                            T1B = VADD(T1z, T1A);
                            T3u = VSUB(T1A, T1z);
                            TY = VSUB(TS, TP);
                            TT = VADD(TP, TS);
                            T35 = LDW(&(W[TWVL * 30]));
                            T4m = LDW(&(W[TWVL * 10]));
                            T2A = VFNMS(LDK(KP198912367), T2p, T2q);
                            T2r = VFMA(LDK(KP198912367), T2q, T2p);
                            T1c = VFNMS(LDK(KP668178637), TD, TI);
                            TJ = VFMA(LDK(KP668178637), TI, TD);
                            T1C = VSUB(T1y, T1B);
                            T2K = VADD(T1y, T1B);
                            T3v = VFNMS(LDK(KP414213562), T3u, T3t);
                            T3C = VFMA(LDK(KP414213562), T3t, T3u);
                            T2s = VFNMS(LDK(KP707106781), TY, TX);
                            TZ = VFMA(LDK(KP707106781), TY, TX);
                            T2t = VFMA(LDK(KP707106781), TT, TM);
                            TU = VFNMS(LDK(KP707106781), TT, TM);
                            T1M = VSUB(T1v, T1C);
                            T1D = VADD(T1v, T1C);
                            T37 = VADD(T2J, T2K);
                            T2L = VSUB(T2J, T2K);
                            T3w = VADD(T3s, T3v);
                            T49 = VSUB(T3s, T3v);
                            T3D = VSUB(T3B, T3C);
                            T46 = VADD(T3B, T3C);
                            T2u = VFNMS(LDK(KP198912367), T2t, T2s);
                            T2B = VFMA(LDK(KP198912367), T2s, T2t);
                            T10 = VFNMS(LDK(KP668178637), TZ, TU);
                            T1d = VFMA(LDK(KP668178637), TU, TZ);
                            T38 = VZMUL(T35, VSUB(T36, T37));
                            T3f = VADD(T36, T37);
                            T4a = VFMA(LDK(KP923879532), T49, T48);
                            T4o = VFNMS(LDK(KP923879532), T49, T48);
                            T4n = VFMA(LDK(KP923879532), T46, T45);
                            T47 = VFNMS(LDK(KP923879532), T46, T45);
                            T4u = LDW(&(W[TWVL * 50]));
                            T3R = VFMA(LDK(KP923879532), T3w, T3p);
                            T3x = VFNMS(LDK(KP923879532), T3w, T3p);
                            T3E = VFNMS(LDK(KP923879532), T3D, T3A);
                            T3S = VFMA(LDK(KP923879532), T3D, T3A);
                            T3Q = LDW(&(W[TWVL * 58]));
                            T3Y = LDW(&(W[TWVL * 2]));
                        }
                        T28 = VFMA(LDK(KP707106781), T1D, T1o);
                        T1E = VFNMS(LDK(KP707106781), T1D, T1o);
                        T1N = VFNMS(LDK(KP707106781), T1M, T1L);
                        T29 = VFMA(LDK(KP707106781), T1M, T1L);
                        T4p = VZMUL(T4m, VFNMSI(T4o, T4n));
                        T4v = VZMUL(T4u, VFMAI(T4o, T4n));
                        T27 = LDW(&(W[TWVL * 6]));
                        T2h = LDW(&(W[TWVL * 54]));
                        T3T = VZMUL(T3Q, VFNMSI(T3S, T3R));
                        T3Z = VZMUL(T3Y, VFMAI(T3S, T3R));
                        T44 = LDW(&(W[TWVL * 18]));
                        T4g = LDW(&(W[TWVL * 42]));
                        T2a = VZMUL(T27, VFMAI(T29, T28));
                        T2i = VZMUL(T2h, VFNMSI(T29, T28));
                        T1h = LDW(&(W[TWVL * 22]));
                        T1Z = LDW(&(W[TWVL * 38]));
                        T4b = VZMUL(T44, VFMAI(T4a, T47));
                        T4h = VZMUL(T4g, VFNMSI(T4a, T47));
                        {
                            V T1W, T1T, T1, T3W, T2d, T3I, T2e, T12, T2f, T1f, T2F, T2T, T3k, T3K, T11;
                            V T1e, T32, T2Z, T2l, T4k, T2P, T4e, T2Q, T2w, T2R, T2D, T2v, T2C, T1R, T4s;
                            V T23, T42, T24, T1U, T25, T1X;
                            T2F = LDW(&(W[TWVL * 46]));
                            T2T = LDW(&(W[TWVL * 14]));
                            T1O = VZMUL(T1h, VFNMSI(T1N, T1E));
                            T20 = VZMUL(T1Z, VFMAI(T1N, T1E));
                            T3k = LDW(&(W[TWVL * 26]));
                            T3K = LDW(&(W[TWVL * 34]));
                            T2M = VZMUL(T2F, VFNMSI(T2L, T2I));
                            T2U = VZMUL(T2T, VFMAI(T2L, T2I));
                            T11 = VADD(TJ, T10);
                            T1W = VSUB(TJ, T10);
                            T1T = VSUB(T1d, T1c);
                            T1e = VADD(T1c, T1d);
                            T1 = LDW(&(W[TWVL * 24]));
                            T3W = LDW(&(W[TWVL * 4]));
                            T3F = VZMUL(T3k, VFNMSI(T3E, T3x));
                            T3L = VZMUL(T3K, VFMAI(T3E, T3x));
                            T2d = LDW(&(W[TWVL * 56]));
                            T3I = LDW(&(W[TWVL * 36]));
                            T2e = VFMA(LDK(KP831469612), T11, Ts);
                            T12 = VFNMS(LDK(KP831469612), T11, Ts);
                            T2f = VFMA(LDK(KP831469612), T1e, T1b);
                            T1f = VFNMS(LDK(KP831469612), T1e, T1b);
                            T2v = VSUB(T2r, T2u);
                            T32 = VADD(T2r, T2u);
                            T2Z = VADD(T2A, T2B);
                            T2C = VSUB(T2A, T2B);
                            T2l = LDW(&(W[TWVL * 48]));
                            T4k = LDW(&(W[TWVL * 12]));
                            T2P = LDW(&(W[TWVL * 16]));
                            T4e = LDW(&(W[TWVL * 44]));
                            T2g = VZMULI(T2d, VFMAI(T2f, T2e));
                            T3X = VZMULI(T3W, VFNMSI(T2f, T2e));
                            T3J = VZMULI(T3I, VFNMSI(T1f, T12));
                            T1g = VZMULI(T1, VFMAI(T1f, T12));
                            T2Q = VFNMS(LDK(KP980785280), T2v, T2o);
                            T2w = VFMA(LDK(KP980785280), T2v, T2o);
                            T2R = VFMA(LDK(KP980785280), T2C, T2z);
                            T2D = VFNMS(LDK(KP980785280), T2C, T2z);
                            T1R = LDW(&(W[TWVL * 40]));
                            T4s = LDW(&(W[TWVL * 52]));
                            T23 = LDW(&(W[TWVL * 8]));
                            T42 = LDW(&(W[TWVL * 20]));
                            T4f = VZMULI(T4e, VFNMSI(T2R, T2Q));
                            T2S = VZMULI(T2P, VFMAI(T2R, T2Q));
                            T4l = VZMULI(T4k, VFNMSI(T2D, T2w));
                            T2E = VZMULI(T2l, VFMAI(T2D, T2w));
                            T24 = VFMA(LDK(KP831469612), T1T, T1S);
                            T1U = VFNMS(LDK(KP831469612), T1T, T1S);
                            T25 = VFMA(LDK(KP831469612), T1W, T1V);
                            T1X = VFNMS(LDK(KP831469612), T1W, T1V);
                            T2X = LDW(&(W[TWVL * 32]));
                            T3O = LDW(&(W[TWVL * 60]));
                            T3b = LDW(&(W[0]));
                            T3i = LDW(&(W[TWVL * 28]));
                            T26 = VZMULI(T23, VFMAI(T25, T24));
                            T4t = VZMULI(T4s, VFNMSI(T25, T24));
                            T43 = VZMULI(T42, VFNMSI(T1X, T1U));
                            T1Y = VZMULI(T1R, VFMAI(T1X, T1U));
                            T3c = VFMA(LDK(KP980785280), T2Z, T2Y);
                            T30 = VFNMS(LDK(KP980785280), T2Z, T2Y);
                            T3d = VFMA(LDK(KP980785280), T32, T31);
                            T33 = VFNMS(LDK(KP980785280), T32, T31);
                        }
                    }
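                    /* Final butterflies and stores: each result is written
                       once to the forward half (Rp) and once, conjugated,
                       to the mirrored slot in the backward half (Rm),
                       preserving the half-complex (Hermitian) layout. */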
                    {
                        V T3e, T3P, T3j, T34, T2c, T4j, T2k, T4d, T1P, T1Q, T4x, T4w, T2j, T4c, T21;
                        V T22, T4r, T4q, T2b, T4i, T3h, T3H, T2N, T2O, T41, T40, T3g, T3G, T2V, T2W;
                        V T3V, T3U, T39, T3M;
                        T1P = VADD(T1g, T1O);
                        T1Q = VCONJ(VSUB(T1O, T1g));
                        T4x = VCONJ(VSUB(T4v, T4t));
                        T4w = VADD(T4t, T4v);
                        T2j = VADD(T2g, T2i);
                        T2k = VCONJ(VSUB(T2i, T2g));
                        T4d = VCONJ(VSUB(T4b, T43));
                        T4c = VADD(T43, T4b);
                        T3e = VZMULI(T3b, VFMAI(T3d, T3c));
                        T3P = VZMULI(T3O, VFNMSI(T3d, T3c));
                        T3j = VZMULI(T3i, VFNMSI(T33, T30));
                        T34 = VZMULI(T2X, VFMAI(T33, T30));
                        ST(&(Rp[WS(rs, 6)]), T1P, ms, &(Rp[0]));
                        ST(&(Rp[WS(rs, 13)]), T4w, ms, &(Rp[WS(rs, 1)]));
                        ST(&(Rp[WS(rs, 14)]), T2j, ms, &(Rp[0]));
                        ST(&(Rp[WS(rs, 5)]), T4c, ms, &(Rp[WS(rs, 1)]));
                        ST(&(Rm[WS(rs, 13)]), T4x, -ms, &(Rm[WS(rs, 1)]));
                        ST(&(Rm[WS(rs, 6)]), T1Q, -ms, &(Rm[0]));
                        T21 = VADD(T1Y, T20);
                        T22 = VCONJ(VSUB(T20, T1Y));
                        T4r = VCONJ(VSUB(T4p, T4l));
                        T4q = VADD(T4l, T4p);
                        T2b = VADD(T26, T2a);
                        T2c = VCONJ(VSUB(T2a, T26));
                        T4j = VCONJ(VSUB(T4h, T4f));
                        T4i = VADD(T4f, T4h);
                        ST(&(Rm[WS(rs, 5)]), T4d, -ms, &(Rm[WS(rs, 1)]));
                        ST(&(Rm[WS(rs, 14)]), T2k, -ms, &(Rm[0]));
                        ST(&(Rp[WS(rs, 10)]), T21, ms, &(Rp[0]));
                        ST(&(Rp[WS(rs, 3)]), T4q, ms, &(Rp[WS(rs, 1)]));
                        ST(&(Rp[WS(rs, 2)]), T2b, ms, &(Rp[0]));
                        ST(&(Rp[WS(rs, 11)]), T4i, ms, &(Rp[WS(rs, 1)]));
                        ST(&(Rm[WS(rs, 3)]), T4r, -ms, &(Rm[WS(rs, 1)]));
                        ST(&(Rm[WS(rs, 10)]), T22, -ms, &(Rm[0]));
                        T2N = VADD(T2E, T2M);
                        T2O = VCONJ(VSUB(T2M, T2E));
                        T41 = VCONJ(VSUB(T3Z, T3X));
                        T40 = VADD(T3X, T3Z);
                        T3g = VADD(T3e, T3f);
                        T3h = VCONJ(VSUB(T3f, T3e));
                        T3H = VCONJ(VSUB(T3F, T3j));
                        T3G = VADD(T3j, T3F);
                        ST(&(Rm[WS(rs, 11)]), T4j, -ms, &(Rm[WS(rs, 1)]));
                        ST(&(Rm[WS(rs, 2)]), T2c, -ms, &(Rm[0]));
                        ST(&(Rp[WS(rs, 12)]), T2N, ms, &(Rp[0]));
                        ST(&(Rp[WS(rs, 1)]), T40, ms, &(Rp[WS(rs, 1)]));
                        ST(&(Rp[0]), T3g, ms, &(Rp[0]));
                        ST(&(Rp[WS(rs, 7)]), T3G, ms, &(Rp[WS(rs, 1)]));
                        ST(&(Rm[WS(rs, 1)]), T41, -ms, &(Rm[WS(rs, 1)]));
                        ST(&(Rm[WS(rs, 12)]), T2O, -ms, &(Rm[0]));
                        T2V = VADD(T2S, T2U);
                        T2W = VCONJ(VSUB(T2U, T2S));
                        T3V = VCONJ(VSUB(T3T, T3P));
                        T3U = VADD(T3P, T3T);
                        T39 = VADD(T34, T38);
                        T3a = VCONJ(VSUB(T38, T34));
                        T3N = VCONJ(VSUB(T3L, T3J));
                        T3M = VADD(T3J, T3L);
                        ST(&(Rm[WS(rs, 7)]), T3H, -ms, &(Rm[WS(rs, 1)]));
                        ST(&(Rm[0]), T3h, -ms, &(Rm[0]));
                        ST(&(Rp[WS(rs, 4)]), T2V, ms, &(Rp[0]));
                        ST(&(Rp[WS(rs, 15)]), T3U, ms, &(Rp[WS(rs, 1)]));
                        ST(&(Rp[WS(rs, 8)]), T39, ms, &(Rp[0]));
                        ST(&(Rp[WS(rs, 9)]), T3M, ms, &(Rp[WS(rs, 1)]));
                        ST(&(Rm[WS(rs, 15)]), T3V, -ms, &(Rm[WS(rs, 1)]));
                        ST(&(Rm[WS(rs, 4)]), T2W, -ms, &(Rm[0]));
                    }
                }
            }
            ST(&(Rm[WS(rs, 9)]), T3N, -ms, &(Rm[WS(rs, 1)]));
            ST(&(Rm[WS(rs, 8)]), T3a, -ms, &(Rm[0]));
        }
    }
    VLEAVE();
}

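/* The twiddle program below appears to request the complex twiddle factors
   w^1 .. w^31 needed by each size-32 transform; TW_NEXT then advances to
   the next batch of VL transforms. */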
static const tw_instr twinstr[] = {
    VTW(1, 1),
    VTW(1, 2),
    VTW(1, 3),
    VTW(1, 4),
    VTW(1, 5),
    VTW(1, 6),
    VTW(1, 7),
    VTW(1, 8),
    VTW(1, 9),
    VTW(1, 10),
    VTW(1, 11),
    VTW(1, 12),
    VTW(1, 13),
    VTW(1, 14),
    VTW(1, 15),
    VTW(1, 16),
    VTW(1, 17),
    VTW(1, 18),
    VTW(1, 19),
    VTW(1, 20),
    VTW(1, 21),
    VTW(1, 22),
    VTW(1, 23),
    VTW(1, 24),
    VTW(1, 25),
    VTW(1, 26),
    VTW(1, 27),
    VTW(1, 28),
    VTW(1, 29),
    VTW(1, 30),
    VTW(1, 31),
    {TW_NEXT, VL, 0}
};

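/* Codelet descriptor: radix 32, printable name, the twiddle program above,
   and, per genfft's usual convention, the operation counts
   {adds, mults, fused multiply-adds, other} quoted in the header comment. */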
static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cbdftv_32"), twinstr, &GENUS, {119, 62, 130, 0} };

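/* Registration hook invoked by the planner. HC2C_VIA_DFT marks this as a
   half-complex-to-complex codelet implemented via a complex DFT (the "dft"
   in hc2cbdftv), as opposed to a direct hc2c implementation. */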
void XSIMD(codelet_hc2cbdftv_32) (planner *p) {
    X(khc2c_register) (p, hc2cbdftv_32, &desc, HC2C_VIA_DFT);
}
#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dif -sign 1 -name hc2cbdftv_32 -include hc2cbv.h */

/*
 * This function contains 249 FP additions, 104 FP multiplications,
 * (or, 233 additions, 88 multiplications, 16 fused multiply/add),
 * 161 stack variables, 7 constants, and 64 memory accesses
 */
#include "hc2cbv.h"

static void hc2cbdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
    DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
    DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
    DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
    DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
    DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
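    /* Again trigonometric constants: KP195090322 = sin(pi/16),
       KP980785280 = cos(pi/16), KP555570233 = sin(3*pi/16),
       KP831469612 = cos(3*pi/16), KP923879532 = cos(pi/8),
       KP382683432 = sin(pi/8), KP707106781 = cos(pi/4). This variant
       uses the sines directly rather than the tangents of the FMA
       schedule. */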
    {
        INT m;
        for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
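            /* Without hardware FMA, the VFMA/VFNMS macros used below are
               expected to expand to separate multiply and add/subtract
               operations (e.g. VFMA(a, b, c) as VADD(VMUL(a, b), c)), and
               conjugation is spelled out with explicit VCONJ loads. */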
            V T1W, T21, Tf, T2c, T1t, T2r, T3T, T4m, Ty, T2q, T3P, T4n, T1n, T2d, T1T;
            V T22, T1E, T24, T3I, T4p, TU, T2n, T1i, T2h, T1L, T25, T3L, T4q, T1f, T2o;
            V T1j, T2k;
            {
                V T2, T4, T1Z, T1p, T1r, T20, T9, T1U, Td, T1V, T3, T1q, T6, T8, T7;
                V Tc, Tb, Ta, T5, Te, T1o, T1s, T3R, T3S, Tj, T1N, Tw, T1Q, Tn, T1O;
                V Ts, T1R, Tg, Ti, Th, Tv, Tu, Tt, Tk, Tm, Tl, Tp, Tr, Tq, To;
                V Tx, T3N, T3O, T1l, T1m, T1P, T1S;
                T2 = LD(&(Rp[0]), ms, &(Rp[0]));
                T3 = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
                T4 = VCONJ(T3);
                T1Z = VADD(T2, T4);
                T1p = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
                T1q = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
                T1r = VCONJ(T1q);
                T20 = VADD(T1p, T1r);
                T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
                T7 = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
                T8 = VCONJ(T7);
                T9 = VSUB(T6, T8);
                T1U = VADD(T6, T8);
                Tc = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
                Ta = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
                Tb = VCONJ(Ta);
                Td = VSUB(Tb, Tc);
                T1V = VADD(Tb, Tc);
                T1W = VSUB(T1U, T1V);
                T21 = VSUB(T1Z, T20);
                T5 = VSUB(T2, T4);
                Te = VMUL(LDK(KP707106781), VADD(T9, Td));
                Tf = VSUB(T5, Te);
                T2c = VADD(T5, Te);
                T1o = VMUL(LDK(KP707106781), VSUB(T9, Td));
                T1s = VSUB(T1p, T1r);
                T1t = VSUB(T1o, T1s);
                T2r = VADD(T1s, T1o);
                T3R = VADD(T1Z, T20);
                T3S = VADD(T1U, T1V);
                T3T = VSUB(T3R, T3S);
                T4m = VADD(T3R, T3S);
                Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
                Th = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
                Ti = VCONJ(Th);
                Tj = VSUB(Tg, Ti);
                T1N = VADD(Tg, Ti);
                Tv = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
                Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
                Tu = VCONJ(Tt);
                Tw = VSUB(Tu, Tv);
                T1Q = VADD(Tu, Tv);
                Tk = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
                Tl = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
                Tm = VCONJ(Tl);
                Tn = VSUB(Tk, Tm);
                T1O = VADD(Tk, Tm);
                Tp = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
                Tq = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
                Tr = VCONJ(Tq);
                Ts = VSUB(Tp, Tr);
                T1R = VADD(Tp, Tr);
                To = VFMA(LDK(KP382683432), Tj, VMUL(LDK(KP923879532), Tn));
                Tx = VFNMS(LDK(KP382683432), Tw, VMUL(LDK(KP923879532), Ts));
                Ty = VSUB(To, Tx);
                T2q = VADD(To, Tx);
                T3N = VADD(T1N, T1O);
                T3O = VADD(T1Q, T1R);
                T3P = VSUB(T3N, T3O);
                T4n = VADD(T3N, T3O);
                T1l = VFNMS(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tj));
                T1m = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), Ts));
                T1n = VSUB(T1l, T1m);
                T2d = VADD(T1l, T1m);
                T1P = VSUB(T1N, T1O);
                T1S = VSUB(T1Q, T1R);
                T1T = VMUL(LDK(KP707106781), VSUB(T1P, T1S));
                T22 = VMUL(LDK(KP707106781), VADD(T1P, T1S));
            }
            {
                V TD, T1B, TR, T1y, TH, T1C, TM, T1z, TA, TC, TB, TO, TQ, TP, TG;
                V TF, TE, TJ, TL, TK, T1A, T1D, T3G, T3H, TN, T2f, TT, T2g, TI, TS;
                V TY, T1I, T1c, T1F, T12, T1J, T17, T1G, TV, TX, TW, T1b, T1a, T19, T11;
                V T10, TZ, T14, T16, T15, T1H, T1K, T3J, T3K, T18, T2i, T1e, T2j, T13, T1d;
                TA = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
                TB = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
                TC = VCONJ(TB);
                TD = VSUB(TA, TC);
                T1B = VADD(TA, TC);
                TO = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
                TP = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
                TQ = VCONJ(TP);
                TR = VSUB(TO, TQ);
                T1y = VADD(TO, TQ);
                TG = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
                TE = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
                TF = VCONJ(TE);
                TH = VSUB(TF, TG);
                T1C = VADD(TF, TG);
                TJ = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
                TK = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
                TL = VCONJ(TK);
                TM = VSUB(TJ, TL);
                T1z = VADD(TJ, TL);
                T1A = VSUB(T1y, T1z);
                T1D = VSUB(T1B, T1C);
                T1E = VFNMS(LDK(KP382683432), T1D, VMUL(LDK(KP923879532), T1A));
                T24 = VFMA(LDK(KP382683432), T1A, VMUL(LDK(KP923879532), T1D));
                T3G = VADD(T1y, T1z);
                T3H = VADD(T1B, T1C);
                T3I = VSUB(T3G, T3H);
                T4p = VADD(T3G, T3H);
                TI = VMUL(LDK(KP707106781), VSUB(TD, TH));
                TN = VSUB(TI, TM);
                T2f = VADD(TM, TI);
                TS = VMUL(LDK(KP707106781), VADD(TD, TH));
                TT = VSUB(TR, TS);
                T2g = VADD(TR, TS);
                TU = VFMA(LDK(KP831469612), TN, VMUL(LDK(KP555570233), TT));
                T2n = VFNMS(LDK(KP195090322), T2f, VMUL(LDK(KP980785280), T2g));
                T1i = VFNMS(LDK(KP555570233), TN, VMUL(LDK(KP831469612), TT));
                T2h = VFMA(LDK(KP980785280), T2f, VMUL(LDK(KP195090322), T2g));
                TV = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
                TW = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
                TX = VCONJ(TW);
                TY = VSUB(TV, TX);
                T1I = VADD(TV, TX);
                T1b = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
                T19 = LD(&(Rm[0]), -ms, &(Rm[0]));
                T1a = VCONJ(T19);
                T1c = VSUB(T1a, T1b);
                T1F = VADD(T1a, T1b);
                T11 = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
                TZ = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
                T10 = VCONJ(TZ);
                T12 = VSUB(T10, T11);
                T1J = VADD(T10, T11);
                T14 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
                T15 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
                T16 = VCONJ(T15);
                T17 = VSUB(T14, T16);
                T1G = VADD(T14, T16);
                T1H = VSUB(T1F, T1G);
                T1K = VSUB(T1I, T1J);
                T1L = VFMA(LDK(KP923879532), T1H, VMUL(LDK(KP382683432), T1K));
                T25 = VFNMS(LDK(KP382683432), T1H, VMUL(LDK(KP923879532), T1K));
                T3J = VADD(T1F, T1G);
                T3K = VADD(T1I, T1J);
                T3L = VSUB(T3J, T3K);
                T4q = VADD(T3J, T3K);
                T13 = VMUL(LDK(KP707106781), VSUB(TY, T12));
                T18 = VSUB(T13, T17);
                T2i = VADD(T17, T13);
                T1d = VMUL(LDK(KP707106781), VADD(TY, T12));
                T1e = VSUB(T1c, T1d);
                T2j = VADD(T1c, T1d);
                T1f = VFNMS(LDK(KP555570233), T1e, VMUL(LDK(KP831469612), T18));
                T2o = VFMA(LDK(KP195090322), T2i, VMUL(LDK(KP980785280), T2j));
                T1j = VFMA(LDK(KP555570233), T18, VMUL(LDK(KP831469612), T1e));
                T2k = VFNMS(LDK(KP195090322), T2j, VMUL(LDK(KP980785280), T2i));
            }
            {
                V T4L, T4G, T4s, T4y, T3W, T4g, T42, T4a, T3g, T4e, T3o, T3E, T1w, T46, T2M;
                V T40, T2u, T4w, T2C, T4k, T36, T3A, T3i, T3s, T28, T2O, T2w, T2G, T2Y, T4K;
                V T3y, T4C;
                {
                    V T4E, T4F, T4D, T4o, T4r, T4l, T4x, T3Q, T48, T3V, T49, T3M, T3U, T3F, T4f;
                    V T41, T47, T3c, T3n, T3f, T3m, T3a, T3b, T3d, T3e, T39, T4d, T3l, T3D, T1h;
                    V T2K, T1v, T2L, Tz, T1g, T1k, T1u, T1, T45, T2J, T3Z, T2m, T2A, T2t, T2B;
                    V T2e, T2l, T2p, T2s, T2b, T4v, T2z, T4j;
                    T4E = VADD(T4m, T4n);
                    T4F = VADD(T4p, T4q);
                    T4L = VADD(T4E, T4F);
                    T4D = LDW(&(W[TWVL * 30]));
                    T4G = VZMUL(T4D, VSUB(T4E, T4F));
                    T4o = VSUB(T4m, T4n);
                    T4r = VBYI(VSUB(T4p, T4q));
                    T4l = LDW(&(W[TWVL * 46]));
                    T4s = VZMUL(T4l, VSUB(T4o, T4r));
                    T4x = LDW(&(W[TWVL * 14]));
                    T4y = VZMUL(T4x, VADD(T4o, T4r));
                    T3M = VMUL(LDK(KP707106781), VSUB(T3I, T3L));
                    T3Q = VBYI(VSUB(T3M, T3P));
                    T48 = VBYI(VADD(T3P, T3M));
                    T3U = VMUL(LDK(KP707106781), VADD(T3I, T3L));
                    T3V = VSUB(T3T, T3U);
                    T49 = VADD(T3T, T3U);
                    T3F = LDW(&(W[TWVL * 22]));
                    T3W = VZMUL(T3F, VADD(T3Q, T3V));
                    T4f = LDW(&(W[TWVL * 54]));
                    T4g = VZMUL(T4f, VSUB(T49, T48));
                    T41 = LDW(&(W[TWVL * 38]));
                    T42 = VZMUL(T41, VSUB(T3V, T3Q));
                    T47 = LDW(&(W[TWVL * 6]));
                    T4a = VZMUL(T47, VADD(T48, T49));
                    T3a = VADD(T1t, T1n);
                    T3b = VADD(TU, T1f);
                    T3c = VBYI(VADD(T3a, T3b));
                    T3n = VBYI(VSUB(T3b, T3a));
                    T3d = VADD(Tf, Ty);
                    T3e = VADD(T1i, T1j);
                    T3f = VADD(T3d, T3e);
                    T3m = VSUB(T3d, T3e);
                    T39 = LDW(&(W[TWVL * 4]));
                    T3g = VZMULI(T39, VADD(T3c, T3f));
                    T4d = LDW(&(W[TWVL * 56]));
                    T4e = VZMULI(T4d, VSUB(T3f, T3c));
                    T3l = LDW(&(W[TWVL * 36]));
                    T3o = VZMULI(T3l, VSUB(T3m, T3n));
                    T3D = LDW(&(W[TWVL * 24]));
                    T3E = VZMULI(T3D, VADD(T3n, T3m));
                    Tz = VSUB(Tf, Ty);
                    T1g = VSUB(TU, T1f);
                    T1h = VSUB(Tz, T1g);
                    T2K = VADD(Tz, T1g);
                    T1k = VSUB(T1i, T1j);
                    T1u = VSUB(T1n, T1t);
                    T1v = VBYI(VSUB(T1k, T1u));
                    T2L = VBYI(VADD(T1u, T1k));
                    T1 = LDW(&(W[TWVL * 20]));
                    T1w = VZMULI(T1, VADD(T1h, T1v));
                    T45 = LDW(&(W[TWVL * 8]));
                    T46 = VZMULI(T45, VADD(T2K, T2L));
                    T2J = LDW(&(W[TWVL * 52]));
                    T2M = VZMULI(T2J, VSUB(T2K, T2L));
                    T3Z = LDW(&(W[TWVL * 40]));
                    T40 = VZMULI(T3Z, VSUB(T1h, T1v));
                    T2e = VSUB(T2c, T2d);
                    T2l = VSUB(T2h, T2k);
                    T2m = VSUB(T2e, T2l);
                    T2A = VADD(T2e, T2l);
                    T2p = VSUB(T2n, T2o);
                    T2s = VSUB(T2q, T2r);
                    T2t = VBYI(VSUB(T2p, T2s));
                    T2B = VBYI(VADD(T2s, T2p));
                    T2b = LDW(&(W[TWVL * 44]));
                    T2u = VZMULI(T2b, VSUB(T2m, T2t));
                    T4v = LDW(&(W[TWVL * 16]));
                    T4w = VZMULI(T4v, VADD(T2m, T2t));
                    T2z = LDW(&(W[TWVL * 12]));
                    T2C = VZMULI(T2z, VADD(T2A, T2B));
                    T4j = LDW(&(W[TWVL * 48]));
                    T4k = VZMULI(T4j, VSUB(T2A, T2B));
                    {
                        V T32, T3q, T35, T3r, T30, T31, T33, T34, T2Z, T3z, T3h, T3p, T1Y, T2E, T27;
                        V T2F, T1M, T1X, T23, T26, T1x, T2N, T2v, T2D, T2U, T3x, T2X, T3w, T2S, T2T;
                        V T2V, T2W, T2R, T4J, T3v, T4B;
                        T30 = VADD(T21, T22);
                        T31 = VADD(T1E, T1L);
                        T32 = VADD(T30, T31);
                        T3q = VSUB(T30, T31);
                        T33 = VADD(T1W, T1T);
                        T34 = VADD(T24, T25);
                        T35 = VBYI(VADD(T33, T34));
                        T3r = VBYI(VSUB(T34, T33));
                        T2Z = LDW(&(W[TWVL * 58]));
                        T36 = VZMUL(T2Z, VSUB(T32, T35));
                        T3z = LDW(&(W[TWVL * 26]));
                        T3A = VZMUL(T3z, VADD(T3q, T3r));
                        T3h = LDW(&(W[TWVL * 2]));
                        T3i = VZMUL(T3h, VADD(T32, T35));
                        T3p = LDW(&(W[TWVL * 34]));
                        T3s = VZMUL(T3p, VSUB(T3q, T3r));
                        T1M = VSUB(T1E, T1L);
                        T1X = VSUB(T1T, T1W);
                        T1Y = VBYI(VSUB(T1M, T1X));
                        T2E = VBYI(VADD(T1X, T1M));
                        T23 = VSUB(T21, T22);
                        T26 = VSUB(T24, T25);
                        T27 = VSUB(T23, T26);
                        T2F = VADD(T23, T26);
                        T1x = LDW(&(W[TWVL * 18]));
                        T28 = VZMUL(T1x, VADD(T1Y, T27));
                        T2N = LDW(&(W[TWVL * 50]));
                        T2O = VZMUL(T2N, VSUB(T2F, T2E));
                        T2v = LDW(&(W[TWVL * 42]));
                        T2w = VZMUL(T2v, VSUB(T27, T1Y));
                        T2D = LDW(&(W[TWVL * 10]));
                        T2G = VZMUL(T2D, VADD(T2E, T2F));
                        T2S = VADD(T2c, T2d);
                        T2T = VADD(T2n, T2o);
                        T2U = VADD(T2S, T2T);
                        T3x = VSUB(T2S, T2T);
                        T2V = VADD(T2r, T2q);
                        T2W = VADD(T2h, T2k);
                        T2X = VBYI(VADD(T2V, T2W));
                        T3w = VBYI(VSUB(T2W, T2V));
                        T2R = LDW(&(W[TWVL * 60]));
                        T2Y = VZMULI(T2R, VSUB(T2U, T2X));
                        T4J = LDW(&(W[0]));
                        T4K = VZMULI(T4J, VADD(T2X, T2U));
                        T3v = LDW(&(W[TWVL * 28]));
                        T3y = VZMULI(T3v, VADD(T3w, T3x));
                        T4B = LDW(&(W[TWVL * 32]));
                        T4C = VZMULI(T4B, VSUB(T3x, T3w));
                    }
                }
                {
                    V T29, T4M, T2P, T4t, T4N, T2a, T4u, T2Q, T2x, T4H, T2H, T4z, T4I, T2y, T4A;
                    V T2I, T37, T4h, T3B, T3X, T4i, T38, T3Y, T3C, T3j, T4b, T3t, T43, T4c, T3k;
                    V T44, T3u;
                    T29 = VADD(T1w, T28);
                    ST(&(Rp[WS(rs, 5)]), T29, ms, &(Rp[WS(rs, 1)]));
                    T4M = VADD(T4K, T4L);
                    ST(&(Rp[0]), T4M, ms, &(Rp[0]));
                    T2P = VADD(T2M, T2O);
                    ST(&(Rp[WS(rs, 13)]), T2P, ms, &(Rp[WS(rs, 1)]));
                    T4t = VADD(T4k, T4s);
                    ST(&(Rp[WS(rs, 12)]), T4t, ms, &(Rp[0]));
                    T4N = VCONJ(VSUB(T4L, T4K));
                    ST(&(Rm[0]), T4N, -ms, &(Rm[0]));
                    T2a = VCONJ(VSUB(T28, T1w));
                    ST(&(Rm[WS(rs, 5)]), T2a, -ms, &(Rm[WS(rs, 1)]));
                    T4u = VCONJ(VSUB(T4s, T4k));
                    ST(&(Rm[WS(rs, 12)]), T4u, -ms, &(Rm[0]));
                    T2Q = VCONJ(VSUB(T2O, T2M));
                    ST(&(Rm[WS(rs, 13)]), T2Q, -ms, &(Rm[WS(rs, 1)]));
                    T2x = VADD(T2u, T2w);
                    ST(&(Rp[WS(rs, 11)]), T2x, ms, &(Rp[WS(rs, 1)]));
                    T4H = VADD(T4C, T4G);
                    ST(&(Rp[WS(rs, 8)]), T4H, ms, &(Rp[0]));
                    T2H = VADD(T2C, T2G);
                    ST(&(Rp[WS(rs, 3)]), T2H, ms, &(Rp[WS(rs, 1)]));
                    T4z = VADD(T4w, T4y);
                    ST(&(Rp[WS(rs, 4)]), T4z, ms, &(Rp[0]));
                    T4I = VCONJ(VSUB(T4G, T4C));
                    ST(&(Rm[WS(rs, 8)]), T4I, -ms, &(Rm[0]));
                    T2y = VCONJ(VSUB(T2w, T2u));
                    ST(&(Rm[WS(rs, 11)]), T2y, -ms, &(Rm[WS(rs, 1)]));
                    T4A = VCONJ(VSUB(T4y, T4w));
                    ST(&(Rm[WS(rs, 4)]), T4A, -ms, &(Rm[0]));
                    T2I = VCONJ(VSUB(T2G, T2C));
                    ST(&(Rm[WS(rs, 3)]), T2I, -ms, &(Rm[WS(rs, 1)]));
                    T37 = VADD(T2Y, T36);
                    ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
                    T4h = VADD(T4e, T4g);
                    ST(&(Rp[WS(rs, 14)]), T4h, ms, &(Rp[0]));
                    T3B = VADD(T3y, T3A);
                    ST(&(Rp[WS(rs, 7)]), T3B, ms, &(Rp[WS(rs, 1)]));
                    T3X = VADD(T3E, T3W);
                    ST(&(Rp[WS(rs, 6)]), T3X, ms, &(Rp[0]));
                    T4i = VCONJ(VSUB(T4g, T4e));
                    ST(&(Rm[WS(rs, 14)]), T4i, -ms, &(Rm[0]));
                    T38 = VCONJ(VSUB(T36, T2Y));
                    ST(&(Rm[WS(rs, 15)]), T38, -ms, &(Rm[WS(rs, 1)]));
                    T3Y = VCONJ(VSUB(T3W, T3E));
                    ST(&(Rm[WS(rs, 6)]), T3Y, -ms, &(Rm[0]));
                    T3C = VCONJ(VSUB(T3A, T3y));
                    ST(&(Rm[WS(rs, 7)]), T3C, -ms, &(Rm[WS(rs, 1)]));
                    T3j = VADD(T3g, T3i);
                    ST(&(Rp[WS(rs, 1)]), T3j, ms, &(Rp[WS(rs, 1)]));
                    T4b = VADD(T46, T4a);
                    ST(&(Rp[WS(rs, 2)]), T4b, ms, &(Rp[0]));
                    T3t = VADD(T3o, T3s);
                    ST(&(Rp[WS(rs, 9)]), T3t, ms, &(Rp[WS(rs, 1)]));
                    T43 = VADD(T40, T42);
                    ST(&(Rp[WS(rs, 10)]), T43, ms, &(Rp[0]));
                    T4c = VCONJ(VSUB(T4a, T46));
                    ST(&(Rm[WS(rs, 2)]), T4c, -ms, &(Rm[0]));
                    T3k = VCONJ(VSUB(T3i, T3g));
                    ST(&(Rm[WS(rs, 1)]), T3k, -ms, &(Rm[WS(rs, 1)]));
                    T44 = VCONJ(VSUB(T42, T40));
                    ST(&(Rm[WS(rs, 10)]), T44, -ms, &(Rm[0]));
                    T3u = VCONJ(VSUB(T3s, T3o));
                    ST(&(Rm[WS(rs, 9)]), T3u, -ms, &(Rm[WS(rs, 1)]));
                }
            }
        }
    }
    VLEAVE();
}

static const tw_instr twinstr[] = {
    VTW(1, 1),
    VTW(1, 2),
    VTW(1, 3),
    VTW(1, 4),
    VTW(1, 5),
    VTW(1, 6),
    VTW(1, 7),
    VTW(1, 8),
    VTW(1, 9),
    VTW(1, 10),
    VTW(1, 11),
    VTW(1, 12),
    VTW(1, 13),
    VTW(1, 14),
    VTW(1, 15),
    VTW(1, 16),
    VTW(1, 17),
    VTW(1, 18),
    VTW(1, 19),
    VTW(1, 20),
    VTW(1, 21),
    VTW(1, 22),
    VTW(1, 23),
    VTW(1, 24),
    VTW(1, 25),
    VTW(1, 26),
    VTW(1, 27),
    VTW(1, 28),
    VTW(1, 29),
    VTW(1, 30),
    VTW(1, 31),
    {TW_NEXT, VL, 0}
};

static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cbdftv_32"), twinstr, &GENUS, {233, 88, 16, 0} };

void XSIMD(codelet_hc2cbdftv_32) (planner *p) {
    X(khc2c_register) (p, hc2cbdftv_32, &desc, HC2C_VIA_DFT);
}
#endif /* HAVE_FMA */