annotate src/fftw-3.3.5/rdft/simd/common/hc2cfdftv_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:52:40 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dit -name hc2cfdftv_16 -include hc2cfv.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 103 FP additions, 96 FP multiplications,
Chris@42 32 * (or, 53 additions, 46 multiplications, 50 fused multiply/add),
Chris@42 33 * 92 stack variables, 4 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cfv.h"
Chris@42 36
Chris@42 37 static void hc2cfdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 40 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 42 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 43 {
Chris@42 44 INT m;
Chris@42 45 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@42 46 V T8, Tc, TQ, TZ, T1J, T1x, T12, TH, T1I, T1q, Tp, TJ, Te, Tf, Td;
Chris@42 47 V TN, Tj, Tk, Ti, TK, Tg, TO, Tl, TL, T1r, Th, TR, T1y, T1s, Tq;
Chris@42 48 V TM, T1z, T1N, T1t, T10, Tr, T13, TS, T1K, T1A, T1E, T1u, T1f, T11, T1c;
Chris@42 49 V Ts, T1d, T14, T1g, TT;
Chris@42 50 {
Chris@42 51 V T3, Tw, TF, TW, Tz, TA, Ty, TX, T7, Tu, T1, T2, Tv, TD, TE;
Chris@42 52 V TC, TV, T5, T6, T4, Tt, TB, TY, T1o, T1v, Tx, Ta, Tb, T9, TP;
Chris@42 53 V T1w, TG, T1p, Tn, To, Tm, TI;
Chris@42 54 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@42 55 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@42 56 Tv = LDW(&(W[0]));
Chris@42 57 TD = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@42 58 TE = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@42 59 TC = LDW(&(W[TWVL * 8]));
Chris@42 60 TV = LDW(&(W[TWVL * 6]));
Chris@42 61 T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@42 62 T6 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@42 63 T3 = VFMACONJ(T2, T1);
Chris@42 64 Tw = VZMULIJ(Tv, VFNMSCONJ(T2, T1));
Chris@42 65 T4 = LDW(&(W[TWVL * 14]));
Chris@42 66 Tt = LDW(&(W[TWVL * 16]));
Chris@42 67 TF = VZMULIJ(TC, VFNMSCONJ(TE, TD));
Chris@42 68 TW = VZMULJ(TV, VFMACONJ(TE, TD));
Chris@42 69 Tz = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
Chris@42 70 TA = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
Chris@42 71 Ty = LDW(&(W[TWVL * 24]));
Chris@42 72 TX = LDW(&(W[TWVL * 22]));
Chris@42 73 T7 = VZMULJ(T4, VFMACONJ(T6, T5));
Chris@42 74 Tu = VZMULIJ(Tt, VFNMSCONJ(T6, T5));
Chris@42 75 Ta = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 76 Tb = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 77 T9 = LDW(&(W[TWVL * 2]));
Chris@42 78 TP = LDW(&(W[TWVL * 4]));
Chris@42 79 TB = VZMULIJ(Ty, VFNMSCONJ(TA, Tz));
Chris@42 80 TY = VZMULJ(TX, VFMACONJ(TA, Tz));
Chris@42 81 T1o = VADD(T3, T7);
Chris@42 82 T8 = VSUB(T3, T7);
Chris@42 83 T1v = VADD(Tw, Tu);
Chris@42 84 Tx = VSUB(Tu, Tw);
Chris@42 85 Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
Chris@42 86 TQ = VZMULIJ(TP, VFNMSCONJ(Tb, Ta));
Chris@42 87 T1w = VADD(TF, TB);
Chris@42 88 TG = VSUB(TB, TF);
Chris@42 89 T1p = VADD(TW, TY);
Chris@42 90 TZ = VSUB(TW, TY);
Chris@42 91 Tn = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 92 To = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 93 Tm = LDW(&(W[TWVL * 10]));
Chris@42 94 TI = LDW(&(W[TWVL * 12]));
Chris@42 95 T1J = VSUB(T1w, T1v);
Chris@42 96 T1x = VADD(T1v, T1w);
Chris@42 97 T12 = VFMA(LDK(KP414213562), Tx, TG);
Chris@42 98 TH = VFNMS(LDK(KP414213562), TG, Tx);
Chris@42 99 T1I = VSUB(T1o, T1p);
Chris@42 100 T1q = VADD(T1o, T1p);
Chris@42 101 Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
Chris@42 102 TJ = VZMULIJ(TI, VFNMSCONJ(To, Tn));
Chris@42 103 Te = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 104 Tf = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 105 Td = LDW(&(W[TWVL * 18]));
Chris@42 106 TN = LDW(&(W[TWVL * 20]));
Chris@42 107 Tj = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 108 Tk = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 109 Ti = LDW(&(W[TWVL * 26]));
Chris@42 110 TK = LDW(&(W[TWVL * 28]));
Chris@42 111 }
Chris@42 112 Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
Chris@42 113 TO = VZMULIJ(TN, VFNMSCONJ(Tf, Te));
Chris@42 114 Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
Chris@42 115 TL = VZMULIJ(TK, VFNMSCONJ(Tk, Tj));
Chris@42 116 T1r = VADD(Tc, Tg);
Chris@42 117 Th = VSUB(Tc, Tg);
Chris@42 118 TR = VSUB(TO, TQ);
Chris@42 119 T1y = VADD(TQ, TO);
Chris@42 120 T1s = VADD(Tl, Tp);
Chris@42 121 Tq = VSUB(Tl, Tp);
Chris@42 122 TM = VSUB(TJ, TL);
Chris@42 123 T1z = VADD(TL, TJ);
Chris@42 124 T1N = VSUB(T1s, T1r);
Chris@42 125 T1t = VADD(T1r, T1s);
Chris@42 126 T10 = VSUB(Tq, Th);
Chris@42 127 Tr = VADD(Th, Tq);
Chris@42 128 T13 = VFNMS(LDK(KP414213562), TM, TR);
Chris@42 129 TS = VFMA(LDK(KP414213562), TR, TM);
Chris@42 130 T1K = VSUB(T1y, T1z);
Chris@42 131 T1A = VADD(T1y, T1z);
Chris@42 132 T1E = VADD(T1q, T1t);
Chris@42 133 T1u = VSUB(T1q, T1t);
Chris@42 134 T1f = VFMA(LDK(KP707106781), T10, TZ);
Chris@42 135 T11 = VFNMS(LDK(KP707106781), T10, TZ);
Chris@42 136 T1c = VFNMS(LDK(KP707106781), Tr, T8);
Chris@42 137 Ts = VFMA(LDK(KP707106781), Tr, T8);
Chris@42 138 T1d = VSUB(T12, T13);
Chris@42 139 T14 = VADD(T12, T13);
Chris@42 140 T1g = VSUB(TS, TH);
Chris@42 141 TT = VADD(TH, TS);
Chris@42 142 {
Chris@42 143 V T1O, T1L, T1F, T1B, T1k, T1e, T19, T15, T1l, T1h, T18, TU, T1T, T1P, T1S;
Chris@42 144 V T1M, T1H, T1G, T1D, T1C, T1m, T1n, T1j, T1i, T1a, T1b, T17, T16, T1U, T1V;
Chris@42 145 V T1R, T1Q;
Chris@42 146 T1O = VSUB(T1K, T1J);
Chris@42 147 T1L = VADD(T1J, T1K);
Chris@42 148 T1F = VADD(T1x, T1A);
Chris@42 149 T1B = VSUB(T1x, T1A);
Chris@42 150 T1k = VFNMS(LDK(KP923879532), T1d, T1c);
Chris@42 151 T1e = VFMA(LDK(KP923879532), T1d, T1c);
Chris@42 152 T19 = VFNMS(LDK(KP923879532), T14, T11);
Chris@42 153 T15 = VFMA(LDK(KP923879532), T14, T11);
Chris@42 154 T1l = VFNMS(LDK(KP923879532), T1g, T1f);
Chris@42 155 T1h = VFMA(LDK(KP923879532), T1g, T1f);
Chris@42 156 T18 = VFNMS(LDK(KP923879532), TT, Ts);
Chris@42 157 TU = VFMA(LDK(KP923879532), TT, Ts);
Chris@42 158 T1T = VFNMS(LDK(KP707106781), T1O, T1N);
Chris@42 159 T1P = VFMA(LDK(KP707106781), T1O, T1N);
Chris@42 160 T1S = VFNMS(LDK(KP707106781), T1L, T1I);
Chris@42 161 T1M = VFMA(LDK(KP707106781), T1L, T1I);
Chris@42 162 T1H = VCONJ(VMUL(LDK(KP500000000), VADD(T1F, T1E)));
Chris@42 163 T1G = VMUL(LDK(KP500000000), VSUB(T1E, T1F));
Chris@42 164 T1D = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1B, T1u)));
Chris@42 165 T1C = VMUL(LDK(KP500000000), VFMAI(T1B, T1u));
Chris@42 166 T1m = VMUL(LDK(KP500000000), VFNMSI(T1l, T1k));
Chris@42 167 T1n = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1l, T1k)));
Chris@42 168 T1j = VMUL(LDK(KP500000000), VFMAI(T1h, T1e));
Chris@42 169 T1i = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1h, T1e)));
Chris@42 170 T1a = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T19, T18)));
Chris@42 171 T1b = VMUL(LDK(KP500000000), VFMAI(T19, T18));
Chris@42 172 T17 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T15, TU)));
Chris@42 173 T16 = VMUL(LDK(KP500000000), VFNMSI(T15, TU));
Chris@42 174 T1U = VMUL(LDK(KP500000000), VFNMSI(T1T, T1S));
Chris@42 175 T1V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1T, T1S)));
Chris@42 176 T1R = VMUL(LDK(KP500000000), VFMAI(T1P, T1M));
Chris@42 177 T1Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1P, T1M)));
Chris@42 178 ST(&(Rm[WS(rs, 7)]), T1H, -ms, &(Rm[WS(rs, 1)]));
Chris@42 179 ST(&(Rp[0]), T1G, ms, &(Rp[0]));
Chris@42 180 ST(&(Rm[WS(rs, 3)]), T1D, -ms, &(Rm[WS(rs, 1)]));
Chris@42 181 ST(&(Rp[WS(rs, 4)]), T1C, ms, &(Rp[0]));
Chris@42 182 ST(&(Rp[WS(rs, 5)]), T1m, ms, &(Rp[WS(rs, 1)]));
Chris@42 183 ST(&(Rm[WS(rs, 4)]), T1n, -ms, &(Rm[0]));
Chris@42 184 ST(&(Rp[WS(rs, 3)]), T1j, ms, &(Rp[WS(rs, 1)]));
Chris@42 185 ST(&(Rm[WS(rs, 2)]), T1i, -ms, &(Rm[0]));
Chris@42 186 ST(&(Rm[WS(rs, 6)]), T1a, -ms, &(Rm[0]));
Chris@42 187 ST(&(Rp[WS(rs, 7)]), T1b, ms, &(Rp[WS(rs, 1)]));
Chris@42 188 ST(&(Rm[0]), T17, -ms, &(Rm[0]));
Chris@42 189 ST(&(Rp[WS(rs, 1)]), T16, ms, &(Rp[WS(rs, 1)]));
Chris@42 190 ST(&(Rp[WS(rs, 6)]), T1U, ms, &(Rp[0]));
Chris@42 191 ST(&(Rm[WS(rs, 5)]), T1V, -ms, &(Rm[WS(rs, 1)]));
Chris@42 192 ST(&(Rp[WS(rs, 2)]), T1R, ms, &(Rp[0]));
Chris@42 193 ST(&(Rm[WS(rs, 1)]), T1Q, -ms, &(Rm[WS(rs, 1)]));
Chris@42 194 }
Chris@42 195 }
Chris@42 196 }
Chris@42 197 VLEAVE();
Chris@42 198 }
Chris@42 199
Chris@42 200 static const tw_instr twinstr[] = {
Chris@42 201 VTW(1, 1),
Chris@42 202 VTW(1, 2),
Chris@42 203 VTW(1, 3),
Chris@42 204 VTW(1, 4),
Chris@42 205 VTW(1, 5),
Chris@42 206 VTW(1, 6),
Chris@42 207 VTW(1, 7),
Chris@42 208 VTW(1, 8),
Chris@42 209 VTW(1, 9),
Chris@42 210 VTW(1, 10),
Chris@42 211 VTW(1, 11),
Chris@42 212 VTW(1, 12),
Chris@42 213 VTW(1, 13),
Chris@42 214 VTW(1, 14),
Chris@42 215 VTW(1, 15),
Chris@42 216 {TW_NEXT, VL, 0}
Chris@42 217 };
Chris@42 218
Chris@42 219 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cfdftv_16"), twinstr, &GENUS, {53, 46, 50, 0} };
Chris@42 220
Chris@42 221 void XSIMD(codelet_hc2cfdftv_16) (planner *p) {
Chris@42 222 X(khc2c_register) (p, hc2cfdftv_16, &desc, HC2C_VIA_DFT);
Chris@42 223 }
Chris@42 224 #else /* HAVE_FMA */
Chris@42 225
Chris@42 226 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dit -name hc2cfdftv_16 -include hc2cfv.h */
Chris@42 227
Chris@42 228 /*
Chris@42 229 * This function contains 103 FP additions, 56 FP multiplications,
Chris@42 230 * (or, 99 additions, 52 multiplications, 4 fused multiply/add),
Chris@42 231 * 101 stack variables, 5 constants, and 32 memory accesses
Chris@42 232 */
Chris@42 233 #include "hc2cfv.h"
Chris@42 234
Chris@42 235 static void hc2cfdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 236 {
Chris@42 237 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 238 DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
Chris@42 239 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 240 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 241 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 242 {
Chris@42 243 INT m;
Chris@42 244 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@42 245 V T1D, T1E, T1R, TP, T1b, Ta, T1w, T18, T1x, T1z, T1A, T1G, T1H, T1S, Tx;
Chris@42 246 V T13, T10, T1a, T1, T3, TA, TM, TL, TN, T6, T8, TC, TH, TG, TI;
Chris@42 247 V T2, Tz, TK, TJ, T7, TB, TF, TE, TD, TO, T4, T9, T5, T15, T17;
Chris@42 248 V T14, T16;
Chris@42 249 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@42 250 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@42 251 T3 = VCONJ(T2);
Chris@42 252 Tz = LDW(&(W[0]));
Chris@42 253 TA = VZMULIJ(Tz, VSUB(T3, T1));
Chris@42 254 TM = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
Chris@42 255 TK = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
Chris@42 256 TL = VCONJ(TK);
Chris@42 257 TJ = LDW(&(W[TWVL * 24]));
Chris@42 258 TN = VZMULIJ(TJ, VSUB(TL, TM));
Chris@42 259 T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@42 260 T7 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@42 261 T8 = VCONJ(T7);
Chris@42 262 TB = LDW(&(W[TWVL * 16]));
Chris@42 263 TC = VZMULIJ(TB, VSUB(T8, T6));
Chris@42 264 TH = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@42 265 TF = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@42 266 TG = VCONJ(TF);
Chris@42 267 TE = LDW(&(W[TWVL * 8]));
Chris@42 268 TI = VZMULIJ(TE, VSUB(TG, TH));
Chris@42 269 T1D = VADD(TA, TC);
Chris@42 270 T1E = VADD(TI, TN);
Chris@42 271 T1R = VSUB(T1D, T1E);
Chris@42 272 TD = VSUB(TA, TC);
Chris@42 273 TO = VSUB(TI, TN);
Chris@42 274 TP = VFNMS(LDK(KP382683432), TO, VMUL(LDK(KP923879532), TD));
Chris@42 275 T1b = VFMA(LDK(KP382683432), TD, VMUL(LDK(KP923879532), TO));
Chris@42 276 T4 = VADD(T1, T3);
Chris@42 277 T5 = LDW(&(W[TWVL * 14]));
Chris@42 278 T9 = VZMULJ(T5, VADD(T6, T8));
Chris@42 279 Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
Chris@42 280 T1w = VADD(T4, T9);
Chris@42 281 T14 = LDW(&(W[TWVL * 6]));
Chris@42 282 T15 = VZMULJ(T14, VADD(TH, TG));
Chris@42 283 T16 = LDW(&(W[TWVL * 22]));
Chris@42 284 T17 = VZMULJ(T16, VADD(TM, TL));
Chris@42 285 T18 = VSUB(T15, T17);
Chris@42 286 T1x = VADD(T15, T17);
Chris@42 287 {
Chris@42 288 V Tf, TR, Tv, TY, Tk, TT, Tq, TW, Tc, Te, Td, Tb, TQ, Ts, Tu;
Chris@42 289 V Tt, Tr, TX, Th, Tj, Ti, Tg, TS, Tn, Tp, To, Tm, TV, Tl, Tw;
Chris@42 290 V TU, TZ;
Chris@42 291 Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 292 Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 293 Te = VCONJ(Td);
Chris@42 294 Tb = LDW(&(W[TWVL * 2]));
Chris@42 295 Tf = VZMULJ(Tb, VADD(Tc, Te));
Chris@42 296 TQ = LDW(&(W[TWVL * 4]));
Chris@42 297 TR = VZMULIJ(TQ, VSUB(Te, Tc));
Chris@42 298 Ts = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 299 Tt = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 300 Tu = VCONJ(Tt);
Chris@42 301 Tr = LDW(&(W[TWVL * 10]));
Chris@42 302 Tv = VZMULJ(Tr, VADD(Ts, Tu));
Chris@42 303 TX = LDW(&(W[TWVL * 12]));
Chris@42 304 TY = VZMULIJ(TX, VSUB(Tu, Ts));
Chris@42 305 Th = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 306 Ti = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 307 Tj = VCONJ(Ti);
Chris@42 308 Tg = LDW(&(W[TWVL * 18]));
Chris@42 309 Tk = VZMULJ(Tg, VADD(Th, Tj));
Chris@42 310 TS = LDW(&(W[TWVL * 20]));
Chris@42 311 TT = VZMULIJ(TS, VSUB(Tj, Th));
Chris@42 312 Tn = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 313 To = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 314 Tp = VCONJ(To);
Chris@42 315 Tm = LDW(&(W[TWVL * 26]));
Chris@42 316 Tq = VZMULJ(Tm, VADD(Tn, Tp));
Chris@42 317 TV = LDW(&(W[TWVL * 28]));
Chris@42 318 TW = VZMULIJ(TV, VSUB(Tp, Tn));
Chris@42 319 T1z = VADD(Tf, Tk);
Chris@42 320 T1A = VADD(Tq, Tv);
Chris@42 321 T1G = VADD(TR, TT);
Chris@42 322 T1H = VADD(TW, TY);
Chris@42 323 T1S = VSUB(T1H, T1G);
Chris@42 324 Tl = VSUB(Tf, Tk);
Chris@42 325 Tw = VSUB(Tq, Tv);
Chris@42 326 Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
Chris@42 327 T13 = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
Chris@42 328 TU = VSUB(TR, TT);
Chris@42 329 TZ = VSUB(TW, TY);
Chris@42 330 T10 = VFMA(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TZ));
Chris@42 331 T1a = VFNMS(LDK(KP923879532), TU, VMUL(LDK(KP382683432), TZ));
Chris@42 332 }
Chris@42 333 {
Chris@42 334 V T1U, T20, T1X, T21, T1Q, T1T, T1V, T1W, T1Y, T23, T1Z, T22, T1C, T1M, T1J;
Chris@42 335 V T1N, T1y, T1B, T1F, T1I, T1K, T1P, T1L, T1O, T12, T1g, T1d, T1h, Ty, T11;
Chris@42 336 V T19, T1c, T1e, T1j, T1f, T1i, T1m, T1s, T1p, T1t, T1k, T1l, T1n, T1o, T1q;
Chris@42 337 V T1v, T1r, T1u;
Chris@42 338 T1Q = VMUL(LDK(KP500000000), VSUB(T1w, T1x));
Chris@42 339 T1T = VMUL(LDK(KP353553390), VADD(T1R, T1S));
Chris@42 340 T1U = VADD(T1Q, T1T);
Chris@42 341 T20 = VSUB(T1Q, T1T);
Chris@42 342 T1V = VSUB(T1A, T1z);
Chris@42 343 T1W = VMUL(LDK(KP707106781), VSUB(T1S, T1R));
Chris@42 344 T1X = VMUL(LDK(KP500000000), VBYI(VADD(T1V, T1W)));
Chris@42 345 T21 = VMUL(LDK(KP500000000), VBYI(VSUB(T1W, T1V)));
Chris@42 346 T1Y = VCONJ(VSUB(T1U, T1X));
Chris@42 347 ST(&(Rm[WS(rs, 1)]), T1Y, -ms, &(Rm[WS(rs, 1)]));
Chris@42 348 T23 = VADD(T20, T21);
Chris@42 349 ST(&(Rp[WS(rs, 6)]), T23, ms, &(Rp[0]));
Chris@42 350 T1Z = VADD(T1U, T1X);
Chris@42 351 ST(&(Rp[WS(rs, 2)]), T1Z, ms, &(Rp[0]));
Chris@42 352 T22 = VCONJ(VSUB(T20, T21));
Chris@42 353 ST(&(Rm[WS(rs, 5)]), T22, -ms, &(Rm[WS(rs, 1)]));
Chris@42 354 T1y = VADD(T1w, T1x);
Chris@42 355 T1B = VADD(T1z, T1A);
Chris@42 356 T1C = VADD(T1y, T1B);
Chris@42 357 T1M = VSUB(T1y, T1B);
Chris@42 358 T1F = VADD(T1D, T1E);
Chris@42 359 T1I = VADD(T1G, T1H);
Chris@42 360 T1J = VADD(T1F, T1I);
Chris@42 361 T1N = VBYI(VSUB(T1I, T1F));
Chris@42 362 T1K = VCONJ(VMUL(LDK(KP500000000), VSUB(T1C, T1J)));
Chris@42 363 ST(&(Rm[WS(rs, 7)]), T1K, -ms, &(Rm[WS(rs, 1)]));
Chris@42 364 T1P = VMUL(LDK(KP500000000), VADD(T1M, T1N));
Chris@42 365 ST(&(Rp[WS(rs, 4)]), T1P, ms, &(Rp[0]));
Chris@42 366 T1L = VMUL(LDK(KP500000000), VADD(T1C, T1J));
Chris@42 367 ST(&(Rp[0]), T1L, ms, &(Rp[0]));
Chris@42 368 T1O = VCONJ(VMUL(LDK(KP500000000), VSUB(T1M, T1N)));
Chris@42 369 ST(&(Rm[WS(rs, 3)]), T1O, -ms, &(Rm[WS(rs, 1)]));
Chris@42 370 Ty = VADD(Ta, Tx);
Chris@42 371 T11 = VMUL(LDK(KP500000000), VADD(TP, T10));
Chris@42 372 T12 = VADD(Ty, T11);
Chris@42 373 T1g = VSUB(Ty, T11);
Chris@42 374 T19 = VSUB(T13, T18);
Chris@42 375 T1c = VSUB(T1a, T1b);
Chris@42 376 T1d = VMUL(LDK(KP500000000), VBYI(VADD(T19, T1c)));
Chris@42 377 T1h = VMUL(LDK(KP500000000), VBYI(VSUB(T1c, T19)));
Chris@42 378 T1e = VCONJ(VSUB(T12, T1d));
Chris@42 379 ST(&(Rm[0]), T1e, -ms, &(Rm[0]));
Chris@42 380 T1j = VADD(T1g, T1h);
Chris@42 381 ST(&(Rp[WS(rs, 7)]), T1j, ms, &(Rp[WS(rs, 1)]));
Chris@42 382 T1f = VADD(T12, T1d);
Chris@42 383 ST(&(Rp[WS(rs, 1)]), T1f, ms, &(Rp[WS(rs, 1)]));
Chris@42 384 T1i = VCONJ(VSUB(T1g, T1h));
Chris@42 385 ST(&(Rm[WS(rs, 6)]), T1i, -ms, &(Rm[0]));
Chris@42 386 T1k = VSUB(T10, TP);
Chris@42 387 T1l = VADD(T18, T13);
Chris@42 388 T1m = VMUL(LDK(KP500000000), VBYI(VSUB(T1k, T1l)));
Chris@42 389 T1s = VMUL(LDK(KP500000000), VBYI(VADD(T1l, T1k)));
Chris@42 390 T1n = VSUB(Ta, Tx);
Chris@42 391 T1o = VMUL(LDK(KP500000000), VADD(T1b, T1a));
Chris@42 392 T1p = VSUB(T1n, T1o);
Chris@42 393 T1t = VADD(T1n, T1o);
Chris@42 394 T1q = VADD(T1m, T1p);
Chris@42 395 ST(&(Rp[WS(rs, 5)]), T1q, ms, &(Rp[WS(rs, 1)]));
Chris@42 396 T1v = VCONJ(VSUB(T1t, T1s));
Chris@42 397 ST(&(Rm[WS(rs, 2)]), T1v, -ms, &(Rm[0]));
Chris@42 398 T1r = VCONJ(VSUB(T1p, T1m));
Chris@42 399 ST(&(Rm[WS(rs, 4)]), T1r, -ms, &(Rm[0]));
Chris@42 400 T1u = VADD(T1s, T1t);
Chris@42 401 ST(&(Rp[WS(rs, 3)]), T1u, ms, &(Rp[WS(rs, 1)]));
Chris@42 402 }
Chris@42 403 }
Chris@42 404 }
Chris@42 405 VLEAVE();
Chris@42 406 }
Chris@42 407
Chris@42 408 static const tw_instr twinstr[] = {
Chris@42 409 VTW(1, 1),
Chris@42 410 VTW(1, 2),
Chris@42 411 VTW(1, 3),
Chris@42 412 VTW(1, 4),
Chris@42 413 VTW(1, 5),
Chris@42 414 VTW(1, 6),
Chris@42 415 VTW(1, 7),
Chris@42 416 VTW(1, 8),
Chris@42 417 VTW(1, 9),
Chris@42 418 VTW(1, 10),
Chris@42 419 VTW(1, 11),
Chris@42 420 VTW(1, 12),
Chris@42 421 VTW(1, 13),
Chris@42 422 VTW(1, 14),
Chris@42 423 VTW(1, 15),
Chris@42 424 {TW_NEXT, VL, 0}
Chris@42 425 };
Chris@42 426
Chris@42 427 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cfdftv_16"), twinstr, &GENUS, {99, 52, 4, 0} };
Chris@42 428
Chris@42 429 void XSIMD(codelet_hc2cfdftv_16) (planner *p) {
Chris@42 430 X(khc2c_register) (p, hc2cfdftv_16, &desc, HC2C_VIA_DFT);
Chris@42 431 }
Chris@42 432 #endif /* HAVE_FMA */
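
The codelet above is only an internal building block: XSIMD(codelet_hc2cfdftv_16) registers it with FFTW's planner, which may use it as a radix-16 step when planning real-input transforms. A minimal sketch of reaching that path through the public API follows; whether this particular SIMD codelet is actually selected depends on the build (SIMD support, HAVE_FMA), the transform size, and the planner's own measurements. The file name and test signal here are illustrative assumptions, not part of the original source.

#include <stdio.h>
#include <fftw3.h>

int main(void)
{
    const int n = 256;                           /* a size whose factorization admits radix-16 steps */
    double *in = fftw_alloc_real(n);
    fftw_complex *out = fftw_alloc_complex(n / 2 + 1);

    /* Plan first: FFTW_MEASURE may overwrite the arrays while timing candidate plans. */
    fftw_plan p = fftw_plan_dft_r2c_1d(n, in, out, FFTW_MEASURE);

    for (int i = 0; i < n; ++i)
        in[i] = (double) i / n;                  /* arbitrary test signal, filled after planning */

    fftw_execute(p);
    printf("DC bin: %g\n", out[0][0]);           /* out[k][0] = Re, out[k][1] = Im */

    fftw_destroy_plan(p);
    fftw_free(in);
    fftw_free(out);
    return 0;
}

Compile and link against the double-precision library, e.g. gcc example.c -lfftw3 -lm (example.c is a hypothetical file name).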