annotate src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:42:30 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-rdft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dif -sign 1 -name hc2cbdftv_16 -include hc2cbv.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 103 FP additions, 80 FP multiplications,
Chris@10 32 * (or, 53 additions, 30 multiplications, 50 fused multiply/add),
Chris@10 33 * 123 stack variables, 3 constants, and 32 memory accesses
Chris@10 34 */
Chris@10 35 #include "hc2cbv.h"
Chris@10 36
/*
 * hc2cbdftv_16 (HAVE_FMA variant): in-place size-16 SIMD codelet body.
 *
 * For every m in [mb, me), stepping by VL, one iteration
 *   - loads eight vectors from Rp[WS(rs, 0..7)] and eight from Rm[WS(rs, 0..7)],
 *   - combines them using the three constants declared with DVK below,
 *   - multiplies 15 of the 16 results by twiddle vectors read with LDW from
 *     W[TWVL * 0], W[TWVL * 2], ..., W[TWVL * 28] (the remaining one, T1j,
 *     is used without a twiddle factor),
 *   - stores, for each k in 0..7, a sum to Rp[WS(rs, k)] and the VCONJ of the
 *     matching difference to Rm[WS(rs, k)].
 *
 * Parameters:
 *   Rp, Rm  data pointers that are both read and written; Rp advances by
 *           VL * ms per iteration while Rm moves back by VL * ms.
 *   Ip, Im  stepped in the loop header exactly like Rp/Rm but never
 *           dereferenced inside this function.
 *   W       twiddle table; first offset by (mb - 1) * ((TWVL / VL) * 30),
 *           then advanced by TWVL * 30 per iteration.
 *   rs      stride handed to WS() when indexing Rp/Rm.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration stride; also passed (negated for Rm) to LD/ST.
 *
 * NOTE(review): the exact semantics of LD/ST/LDW/VZMUL/VZMULI/VFMAI/VFNMSI
 * and the V*CONJ helpers come from headers outside this file - confirm there
 * before relying on any interpretation of the arithmetic.
 */
Chris@10 37 static void hc2cbdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
/* Numerically: cos(pi/8), sqrt(2) - 1 (= tan(pi/8)), and sqrt(2)/2. */
Chris@10 39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 40 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@10 41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 42 {
Chris@10 43 INT m;
/* All pointer stepping happens in the loop header; the body only indexes. */
Chris@10 44 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
/* These temporaries outlive the inner blocks because they feed the final stores. */
Chris@10 45 V T1D, T1F, TV, TW, T17, T18, T1B, T1A, T1H, T1G;
Chris@10 46 {
Chris@10 47 V T8, Tv, Tb, TF, Tl, TJ, TP, T1w, TE, T1t, T10, T1p, TG, Te, Tg;
Chris@10 48 V Th, T2, T3, Ts, Tt, T5, T6, Tp, Tq, T9, TA, T4, TC, Tu, TN;
Chris@10 49 V T7, TB, Tr, Ta, Tj, Tk, Tc, Td, TY, TD, TO, TZ, T1Q, T19, T1I;
Chris@10 50 V T1d, Tf, T11, TH, TQ, Ti, TI, T1k, T1K, T1S, T1r, T14, T16, TU, Ty;
Chris@10 51 V T1z, TX, T1o, T1, TK, TR, Tm, T12, T1C, Tz, T15;
/*
 * Input loads are interleaved with the first add/subtract stages. Each
 * Rp[k] value is paired with the Rm[7 - k] value through one of the
 * VFMACONJ / VFNMSCONJ / VFMSCONJ helpers.
 */
Chris@10 52 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@10 53 T3 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 54 Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
Chris@10 55 Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 56 T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@10 57 T6 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 58 Tp = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@10 59 Tq = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 60 T9 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 61 TA = VFNMSCONJ(T3, T2);
Chris@10 62 T4 = VFMACONJ(T3, T2);
Chris@10 63 TC = VFMSCONJ(Tt, Ts);
Chris@10 64 Tu = VFMACONJ(Tt, Ts);
Chris@10 65 TN = VFNMSCONJ(T6, T5);
Chris@10 66 T7 = VFMACONJ(T6, T5);
Chris@10 67 TB = VFNMSCONJ(Tq, Tp);
Chris@10 68 Tr = VFMACONJ(Tq, Tp);
Chris@10 69 Ta = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
Chris@10 70 Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 71 Tk = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@10 72 Tc = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 73 Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@10 74 T8 = VSUB(T4, T7);
Chris@10 75 TY = VADD(T4, T7);
Chris@10 76 TD = VADD(TB, TC);
Chris@10 77 TO = VSUB(TB, TC);
Chris@10 78 Tv = VSUB(Tr, Tu);
Chris@10 79 TZ = VADD(Tr, Tu);
Chris@10 80 Tb = VFMACONJ(Ta, T9);
Chris@10 81 TF = VFNMSCONJ(Ta, T9);
Chris@10 82 Tl = VFMACONJ(Tk, Tj);
Chris@10 83 TJ = VFNMSCONJ(Tk, Tj);
Chris@10 84 TP = VFMA(LDK(KP707106781), TO, TN);
Chris@10 85 T1w = VFNMS(LDK(KP707106781), TO, TN);
Chris@10 86 TE = VFMA(LDK(KP707106781), TD, TA);
Chris@10 87 T1t = VFNMS(LDK(KP707106781), TD, TA);
Chris@10 88 T10 = VADD(TY, TZ);
Chris@10 89 T1p = VSUB(TY, TZ);
Chris@10 90 TG = VFNMSCONJ(Td, Tc);
Chris@10 91 Te = VFMACONJ(Td, Tc);
Chris@10 92 Tg = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 93 Th = LD(&(Rm[0]), -ms, &(Rm[0]));
/* Twiddle vectors are fetched from W at even multiples of TWVL, well before use. */
Chris@10 94 T1Q = LDW(&(W[TWVL * 22]));
Chris@10 95 T19 = LDW(&(W[TWVL * 26]));
Chris@10 96 T1I = LDW(&(W[TWVL * 2]));
Chris@10 97 T1d = LDW(&(W[TWVL * 28]));
Chris@10 98 Tf = VSUB(Tb, Te);
Chris@10 99 T11 = VADD(Tb, Te);
Chris@10 100 TH = VFNMS(LDK(KP414213562), TG, TF);
Chris@10 101 TQ = VFMA(LDK(KP414213562), TF, TG);
Chris@10 102 Ti = VFMACONJ(Th, Tg);
Chris@10 103 TI = VFMSCONJ(Th, Tg);
Chris@10 104 T1k = LDW(&(W[0]));
Chris@10 105 T1K = LDW(&(W[TWVL * 4]));
Chris@10 106 T1S = LDW(&(W[TWVL * 24]));
Chris@10 107 TX = LDW(&(W[TWVL * 14]));
Chris@10 108 T1o = LDW(&(W[TWVL * 6]));
Chris@10 109 T1 = LDW(&(W[TWVL * 10]));
Chris@10 110 TK = VFMA(LDK(KP414213562), TJ, TI);
Chris@10 111 TR = VFNMS(LDK(KP414213562), TI, TJ);
Chris@10 112 Tm = VSUB(Ti, Tl);
Chris@10 113 T12 = VADD(Ti, Tl);
Chris@10 114 T1C = LDW(&(W[TWVL * 18]));
Chris@10 115 Tz = LDW(&(W[TWVL * 12]));
Chris@10 116 T15 = LDW(&(W[TWVL * 16]));
Chris@10 117 {
Chris@10 118 V T1v, T1y, T1N, T1g, T1J, T1c, T1U, T1V, T1m, T1n, T1s, TS, T1u, TL, T1x;
Chris@10 119 V T13, T1q, Tn, Tw, T1L, T1f, TT, T1M, T1e, TM, T1R, T1j, T1b, Tx, T1a;
Chris@10 120 V To, T1T, T1l, T1E, T1O, T1P, T1h, T1i;
Chris@10 121 T1s = LDW(&(W[TWVL * 8]));
Chris@10 122 TS = VADD(TQ, TR);
Chris@10 123 T1u = VSUB(TQ, TR);
Chris@10 124 TL = VADD(TH, TK);
Chris@10 125 T1x = VSUB(TH, TK);
Chris@10 126 T13 = VADD(T11, T12);
Chris@10 127 T1q = VSUB(T11, T12);
Chris@10 128 Tn = VADD(Tf, Tm);
Chris@10 129 Tw = VSUB(Tf, Tm);
Chris@10 130 T1L = VFMA(LDK(KP923879532), T1u, T1t);
Chris@10 131 T1v = VFNMS(LDK(KP923879532), T1u, T1t);
Chris@10 132 T1f = VFMA(LDK(KP923879532), TS, TP);
Chris@10 133 TT = VFNMS(LDK(KP923879532), TS, TP);
Chris@10 134 T1M = VFNMS(LDK(KP923879532), T1x, T1w);
Chris@10 135 T1y = VFMA(LDK(KP923879532), T1x, T1w);
Chris@10 136 T1e = VFMA(LDK(KP923879532), TL, TE);
Chris@10 137 TM = VFNMS(LDK(KP923879532), TL, TE);
/* From here on each VZMUL / VZMULI applies one of the loaded twiddle vectors. */
Chris@10 138 T1r = VZMUL(T1o, VFMAI(T1q, T1p));
Chris@10 139 T1R = VZMUL(T1Q, VFNMSI(T1q, T1p));
Chris@10 140 T14 = VZMUL(TX, VSUB(T10, T13));
/* T1j is the only result that is stored without a twiddle multiplication. */
Chris@10 141 T1j = VADD(T10, T13);
Chris@10 142 T1b = VFMA(LDK(KP707106781), Tw, Tv);
Chris@10 143 Tx = VFNMS(LDK(KP707106781), Tw, Tv);
Chris@10 144 T1a = VFMA(LDK(KP707106781), Tn, T8);
Chris@10 145 To = VFNMS(LDK(KP707106781), Tn, T8);
Chris@10 146 T1T = VZMULI(T1S, VFMAI(T1M, T1L));
Chris@10 147 T1N = VZMULI(T1K, VFNMSI(T1M, T1L));
Chris@10 148 T16 = VZMULI(T15, VFMAI(TT, TM));
Chris@10 149 TU = VZMULI(Tz, VFNMSI(TT, TM));
Chris@10 150 T1l = VZMULI(T1k, VFMAI(T1f, T1e));
Chris@10 151 T1g = VZMULI(T1d, VFNMSI(T1f, T1e));
Chris@10 152 T1D = VZMUL(T1C, VFMAI(Tx, To));
Chris@10 153 Ty = VZMUL(T1, VFNMSI(Tx, To));
Chris@10 154 T1J = VZMUL(T1I, VFMAI(T1b, T1a));
Chris@10 155 T1c = VZMUL(T19, VFNMSI(T1b, T1a));
/* Output pairs: the sum is written to Rp[k], the conjugated difference to Rm[k]. */
Chris@10 156 T1U = VCONJ(VSUB(T1R, T1T));
Chris@10 157 T1V = VADD(T1R, T1T);
Chris@10 158 T1m = VCONJ(VSUB(T1j, T1l));
Chris@10 159 T1n = VADD(T1j, T1l);
Chris@10 160 T1z = VZMULI(T1s, VFMAI(T1y, T1v));
Chris@10 161 T1E = LDW(&(W[TWVL * 20]));
Chris@10 162 T1O = VCONJ(VSUB(T1J, T1N));
Chris@10 163 T1P = VADD(T1J, T1N);
Chris@10 164 T1h = VCONJ(VSUB(T1c, T1g));
Chris@10 165 T1i = VADD(T1c, T1g);
/* First half of the stores (k = 6, 0, 1, 7); the rest follow after the blocks close. */
Chris@10 166 ST(&(Rp[WS(rs, 6)]), T1V, ms, &(Rp[0]));
Chris@10 167 ST(&(Rm[WS(rs, 6)]), T1U, -ms, &(Rm[0]));
Chris@10 168 ST(&(Rp[0]), T1n, ms, &(Rp[0]));
Chris@10 169 ST(&(Rm[0]), T1m, -ms, &(Rm[0]));
Chris@10 170 ST(&(Rp[WS(rs, 1)]), T1P, ms, &(Rp[WS(rs, 1)]));
Chris@10 171 ST(&(Rm[WS(rs, 1)]), T1O, -ms, &(Rm[WS(rs, 1)]));
Chris@10 172 ST(&(Rp[WS(rs, 7)]), T1i, ms, &(Rp[WS(rs, 1)]));
Chris@10 173 ST(&(Rm[WS(rs, 7)]), T1h, -ms, &(Rm[WS(rs, 1)]));
Chris@10 174 T1F = VZMULI(T1E, VFNMSI(T1y, T1v));
Chris@10 175 }
Chris@10 176 TV = VCONJ(VSUB(Ty, TU));
Chris@10 177 TW = VADD(Ty, TU);
Chris@10 178 T17 = VCONJ(VSUB(T14, T16));
Chris@10 179 T18 = VADD(T14, T16);
Chris@10 180 T1B = VADD(T1r, T1z);
Chris@10 181 T1A = VCONJ(VSUB(T1r, T1z));
Chris@10 182 }
Chris@10 183 T1H = VADD(T1D, T1F);
Chris@10 184 T1G = VCONJ(VSUB(T1D, T1F));
/* Remaining stores (k = 3, 4, 2, 5). */
Chris@10 185 ST(&(Rm[WS(rs, 3)]), TV, -ms, &(Rm[WS(rs, 1)]));
Chris@10 186 ST(&(Rp[WS(rs, 3)]), TW, ms, &(Rp[WS(rs, 1)]));
Chris@10 187 ST(&(Rm[WS(rs, 4)]), T17, -ms, &(Rm[0]));
Chris@10 188 ST(&(Rm[WS(rs, 2)]), T1A, -ms, &(Rm[0]));
Chris@10 189 ST(&(Rp[WS(rs, 2)]), T1B, ms, &(Rp[0]));
Chris@10 190 ST(&(Rp[WS(rs, 4)]), T18, ms, &(Rp[0]));
Chris@10 191 ST(&(Rp[WS(rs, 5)]), T1H, ms, &(Rp[WS(rs, 1)]));
Chris@10 192 ST(&(Rm[WS(rs, 5)]), T1G, -ms, &(Rm[WS(rs, 1)]));
Chris@10 193 }
Chris@10 194 }
/* Invoked once after the loop. NOTE(review): presumably SIMD state cleanup - confirm in the SIMD header. */
Chris@10 195 VLEAVE();
Chris@10 196 }
Chris@10 197
/*
 * Twiddle instruction list for the HAVE_FMA variant, referenced from desc:
 * fifteen VTW(1, k) entries for k = 1..15, closed by a {TW_NEXT, VL, 0}
 * record. The count matches the fifteen LDW twiddle loads in hc2cbdftv_16.
 * NOTE(review): the meaning of VTW's arguments and of TW_NEXT is defined
 * outside this file - confirm there.
 */
Chris@10 198 static const tw_instr twinstr[] = {
Chris@10 199 VTW(1, 1),
Chris@10 200 VTW(1, 2),
Chris@10 201 VTW(1, 3),
Chris@10 202 VTW(1, 4),
Chris@10 203 VTW(1, 5),
Chris@10 204 VTW(1, 6),
Chris@10 205 VTW(1, 7),
Chris@10 206 VTW(1, 8),
Chris@10 207 VTW(1, 9),
Chris@10 208 VTW(1, 10),
Chris@10 209 VTW(1, 11),
Chris@10 210 VTW(1, 12),
Chris@10 211 VTW(1, 13),
Chris@10 212 VTW(1, 14),
Chris@10 213 VTW(1, 15),
Chris@10 214 {TW_NEXT, VL, 0}
Chris@10 215 };
Chris@10 216
/*
 * Codelet descriptor for the HAVE_FMA variant: the value 16, the codelet
 * name string, the twiddle list, &GENUS, and the initializer {53, 30, 50, 0},
 * whose first three numbers equal the "53 additions, 30 multiplications,
 * 50 fused multiply/add" tally in this variant's header comment.
 * NOTE(review): field meanings come from hc2c_desc, declared outside this
 * file - confirm there.
 */
Chris@10 217 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cbdftv_16"), twinstr, &GENUS, {53, 30, 50, 0} };
Chris@10 218
/*
 * Registration entry point (HAVE_FMA variant): passes the static codelet
 * function and its descriptor to X(khc2c_register) for planner p, with the
 * HC2C_VIA_DFT tag. This is the only non-static symbol in this variant.
 */
Chris@10 219 void XSIMD(codelet_hc2cbdftv_16) (planner *p) {
Chris@10 220 X(khc2c_register) (p, hc2cbdftv_16, &desc, HC2C_VIA_DFT);
Chris@10 221 }
Chris@10 222 #else /* HAVE_FMA */
Chris@10 223
Chris@10 224 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dif -sign 1 -name hc2cbdftv_16 -include hc2cbv.h */
Chris@10 225
Chris@10 226 /*
Chris@10 227 * This function contains 103 FP additions, 42 FP multiplications,
Chris@10 228 * (or, 99 additions, 38 multiplications, 4 fused multiply/add),
Chris@10 229 * 83 stack variables, 3 constants, and 32 memory accesses
Chris@10 230 */
Chris@10 231 #include "hc2cbv.h"
Chris@10 232
/*
 * hc2cbdftv_16 (non-FMA variant): in-place size-16 SIMD codelet body with the
 * same signature, loop header, load/store locations and twiddle offsets as
 * the HAVE_FMA variant, but written with separate VCONJ / VADD / VSUB / VMUL
 * steps and VBYI instead of the fused V*CONJ / VFMAI / VFNMSI helpers.
 *
 * For every m in [mb, me), stepping by VL, one iteration
 *   - loads eight vectors from Rp[WS(rs, 0..7)] and eight from Rm[WS(rs, 0..7)]
 *     (each Rm value is passed through VCONJ right after loading),
 *   - reduces them to the sixteen intermediates declared at loop scope,
 *   - multiplies 15 results by twiddle vectors read with LDW from
 *     W[TWVL * 0], W[TWVL * 2], ..., W[TWVL * 28] (T21 is used untwiddled),
 *   - stores, for each k in 0..7, a sum to Rp[WS(rs, k)] and the VCONJ of the
 *     matching difference to Rm[WS(rs, k)].
 *
 * Parameters:
 *   Rp, Rm  data pointers that are both read and written; Rp advances by
 *           VL * ms per iteration while Rm moves back by VL * ms.
 *   Ip, Im  stepped in the loop header exactly like Rp/Rm but never
 *           dereferenced inside this function.
 *   W       twiddle table; first offset by (mb - 1) * ((TWVL / VL) * 30),
 *           then advanced by TWVL * 30 per iteration.
 *   rs      stride handed to WS() when indexing Rp/Rm.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration stride; also passed (negated for Rm) to LD/ST.
 *
 * NOTE(review): the exact semantics of LD/ST/LDW/VZMUL/VZMULI/VBYI/VCONJ come
 * from headers outside this file - confirm there before relying on any
 * interpretation of the arithmetic.
 */
Chris@10 233 static void hc2cbdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 234 {
/* Numerically: sin(pi/8), cos(pi/8), and sqrt(2)/2. */
Chris@10 235 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 236 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 237 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 238 {
Chris@10 239 INT m;
/* All pointer stepping happens in the loop header; the body only indexes. */
Chris@10 240 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
/* Sixteen intermediates produced by the first block and consumed by the second. */
Chris@10 241 V Tf, T16, TZ, T1C, TI, T1a, TV, T1D, T1F, T1G, Ty, T19, TC, T17, TS;
Chris@10 242 V T10;
/* Block 1: load the inputs and run the add/subtract/scale stages (no twiddles, no stores). */
Chris@10 243 {
Chris@10 244 V T2, TD, T4, TF, Tc, Tb, Td, T6, T8, T9, T3, TE, Ta, T7, T5;
Chris@10 245 V Te, TX, TY, TG, TH, TT, TU, Tj, TM, Tw, TQ, Tn, TN, Ts, TP;
Chris@10 246 V Tg, Ti, Th, Tt, Tv, Tu, Tk, Tm, Tl, Tr, Tq, Tp, To, Tx, TA;
Chris@10 247 V TB, TO, TR;
/* Even-indexed Rp entries paired with odd-indexed Rm entries (Rp[k] with Rm[7 - k]). */
Chris@10 248 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@10 249 TD = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@10 250 T3 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 251 T4 = VCONJ(T3);
Chris@10 252 TE = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 253 TF = VCONJ(TE);
Chris@10 254 Tc = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
Chris@10 255 Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 256 Tb = VCONJ(Ta);
Chris@10 257 Td = VSUB(Tb, Tc);
Chris@10 258 T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@10 259 T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 260 T8 = VCONJ(T7);
Chris@10 261 T9 = VSUB(T6, T8);
Chris@10 262 T5 = VSUB(T2, T4);
Chris@10 263 Te = VMUL(LDK(KP707106781), VADD(T9, Td));
Chris@10 264 Tf = VADD(T5, Te);
Chris@10 265 T16 = VSUB(T5, Te);
Chris@10 266 TX = VADD(T2, T4);
Chris@10 267 TY = VADD(TD, TF);
Chris@10 268 TZ = VSUB(TX, TY);
Chris@10 269 T1C = VADD(TX, TY);
Chris@10 270 TG = VSUB(TD, TF);
Chris@10 271 TH = VMUL(LDK(KP707106781), VSUB(T9, Td));
Chris@10 272 TI = VADD(TG, TH);
Chris@10 273 T1a = VSUB(TH, TG);
Chris@10 274 TT = VADD(T6, T8);
Chris@10 275 TU = VADD(Tb, Tc);
Chris@10 276 TV = VSUB(TT, TU);
Chris@10 277 T1D = VADD(TT, TU);
/* Odd-indexed Rp entries paired with even-indexed Rm entries. */
Chris@10 278 Tg = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 279 Th = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
Chris@10 280 Ti = VCONJ(Th);
Chris@10 281 Tj = VSUB(Tg, Ti);
Chris@10 282 TM = VADD(Tg, Ti);
Chris@10 283 Tt = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 284 Tu = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@10 285 Tv = VCONJ(Tu);
Chris@10 286 Tw = VSUB(Tt, Tv);
Chris@10 287 TQ = VADD(Tt, Tv);
Chris@10 288 Tk = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 289 Tl = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@10 290 Tm = VCONJ(Tl);
Chris@10 291 Tn = VSUB(Tk, Tm);
Chris@10 292 TN = VADD(Tk, Tm);
Chris@10 293 Tr = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 294 Tp = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@10 295 Tq = VCONJ(Tp);
Chris@10 296 Ts = VSUB(Tq, Tr);
Chris@10 297 TP = VADD(Tq, Tr);
Chris@10 298 T1F = VADD(TM, TN);
Chris@10 299 T1G = VADD(TP, TQ);
/* Weighted combinations using the sin(pi/8) / cos(pi/8) constants. */
Chris@10 300 To = VFNMS(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tj));
Chris@10 301 Tx = VFMA(LDK(KP923879532), Ts, VMUL(LDK(KP382683432), Tw));
Chris@10 302 Ty = VADD(To, Tx);
Chris@10 303 T19 = VSUB(To, Tx);
Chris@10 304 TA = VFMA(LDK(KP382683432), Tj, VMUL(LDK(KP923879532), Tn));
Chris@10 305 TB = VFNMS(LDK(KP382683432), Ts, VMUL(LDK(KP923879532), Tw));
Chris@10 306 TC = VADD(TA, TB);
Chris@10 307 T17 = VSUB(TA, TB);
Chris@10 308 TO = VSUB(TM, TN);
Chris@10 309 TR = VSUB(TP, TQ);
Chris@10 310 TS = VMUL(LDK(KP707106781), VSUB(TO, TR));
Chris@10 311 T10 = VMUL(LDK(KP707106781), VADD(TO, TR));
Chris@10 312 }
/* Block 2: final combinations, twiddle multiplications, and all sixteen stores. */
Chris@10 313 {
Chris@10 314 V T21, T1W, T1u, T20, T1I, T1O, TK, T1S, T12, T1e, T1k, T1A, T1o, T1w, T1c;
Chris@10 315 V T1M, T1U, T1V, T1T, T1s, T1t, T1r, T1Z, T1E, T1H, T1B, T1N, Tz, TJ, T1;
Chris@10 316 V T1R, TW, T11, TL, T1d, T1i, T1j, T1h, T1z, T1m, T1n, T1l, T1v, T18, T1b;
Chris@10 317 V T15, T1L, T13, T1g, T1X, T23, T14, T1f, T1Y, T22, T1p, T1y, T1J, T1Q, T1q;
Chris@10 318 V T1x, T1K, T1P;
Chris@10 319 T1U = VADD(T1C, T1D);
Chris@10 320 T1V = VADD(T1F, T1G);
/* T21 is the only result that is stored without a twiddle multiplication. */
Chris@10 321 T21 = VADD(T1U, T1V);
Chris@10 322 T1T = LDW(&(W[TWVL * 14]));
Chris@10 323 T1W = VZMUL(T1T, VSUB(T1U, T1V));
Chris@10 324 T1s = VADD(Tf, Ty);
Chris@10 325 T1t = VBYI(VADD(TI, TC));
Chris@10 326 T1r = LDW(&(W[TWVL * 28]));
Chris@10 327 T1u = VZMULI(T1r, VSUB(T1s, T1t));
Chris@10 328 T1Z = LDW(&(W[0]));
Chris@10 329 T20 = VZMULI(T1Z, VADD(T1s, T1t));
Chris@10 330 T1E = VSUB(T1C, T1D);
Chris@10 331 T1H = VBYI(VSUB(T1F, T1G));
Chris@10 332 T1B = LDW(&(W[TWVL * 22]));
Chris@10 333 T1I = VZMUL(T1B, VSUB(T1E, T1H));
Chris@10 334 T1N = LDW(&(W[TWVL * 6]));
Chris@10 335 T1O = VZMUL(T1N, VADD(T1E, T1H));
Chris@10 336 Tz = VSUB(Tf, Ty);
Chris@10 337 TJ = VBYI(VSUB(TC, TI));
Chris@10 338 T1 = LDW(&(W[TWVL * 12]));
Chris@10 339 TK = VZMULI(T1, VADD(Tz, TJ));
Chris@10 340 T1R = LDW(&(W[TWVL * 16]));
Chris@10 341 T1S = VZMULI(T1R, VSUB(Tz, TJ));
Chris@10 342 TW = VBYI(VSUB(TS, TV));
Chris@10 343 T11 = VSUB(TZ, T10);
Chris@10 344 TL = LDW(&(W[TWVL * 10]));
Chris@10 345 T12 = VZMUL(TL, VADD(TW, T11));
Chris@10 346 T1d = LDW(&(W[TWVL * 18]));
Chris@10 347 T1e = VZMUL(T1d, VSUB(T11, TW));
Chris@10 348 T1i = VBYI(VADD(T1a, T19));
Chris@10 349 T1j = VADD(T16, T17);
Chris@10 350 T1h = LDW(&(W[TWVL * 4]));
Chris@10 351 T1k = VZMULI(T1h, VADD(T1i, T1j));
Chris@10 352 T1z = LDW(&(W[TWVL * 24]));
Chris@10 353 T1A = VZMULI(T1z, VSUB(T1j, T1i));
Chris@10 354 T1m = VBYI(VADD(TV, TS));
Chris@10 355 T1n = VADD(TZ, T10);
Chris@10 356 T1l = LDW(&(W[TWVL * 2]));
Chris@10 357 T1o = VZMUL(T1l, VADD(T1m, T1n));
Chris@10 358 T1v = LDW(&(W[TWVL * 26]));
Chris@10 359 T1w = VZMUL(T1v, VSUB(T1n, T1m));
Chris@10 360 T18 = VSUB(T16, T17);
Chris@10 361 T1b = VBYI(VSUB(T19, T1a));
Chris@10 362 T15 = LDW(&(W[TWVL * 20]));
Chris@10 363 T1c = VZMULI(T15, VSUB(T18, T1b));
Chris@10 364 T1L = LDW(&(W[TWVL * 8]));
Chris@10 365 T1M = VZMULI(T1L, VADD(T1b, T18));
/* Output pairs: the sum is written to Rp[k], the conjugated difference to Rm[k]. */
Chris@10 366 T13 = VADD(TK, T12);
Chris@10 367 ST(&(Rp[WS(rs, 3)]), T13, ms, &(Rp[WS(rs, 1)]));
Chris@10 368 T1g = VCONJ(VSUB(T1e, T1c));
Chris@10 369 ST(&(Rm[WS(rs, 5)]), T1g, -ms, &(Rm[WS(rs, 1)]));
Chris@10 370 T1X = VADD(T1S, T1W);
Chris@10 371 ST(&(Rp[WS(rs, 4)]), T1X, ms, &(Rp[0]));
Chris@10 372 T23 = VCONJ(VSUB(T21, T20));
Chris@10 373 ST(&(Rm[0]), T23, -ms, &(Rm[0]));
Chris@10 374 T14 = VCONJ(VSUB(T12, TK));
Chris@10 375 ST(&(Rm[WS(rs, 3)]), T14, -ms, &(Rm[WS(rs, 1)]));
Chris@10 376 T1f = VADD(T1c, T1e);
Chris@10 377 ST(&(Rp[WS(rs, 5)]), T1f, ms, &(Rp[WS(rs, 1)]));
Chris@10 378 T1Y = VCONJ(VSUB(T1W, T1S));
Chris@10 379 ST(&(Rm[WS(rs, 4)]), T1Y, -ms, &(Rm[0]));
Chris@10 380 T22 = VADD(T20, T21);
Chris@10 381 ST(&(Rp[0]), T22, ms, &(Rp[0]));
Chris@10 382 T1p = VADD(T1k, T1o);
Chris@10 383 ST(&(Rp[WS(rs, 1)]), T1p, ms, &(Rp[WS(rs, 1)]));
Chris@10 384 T1y = VCONJ(VSUB(T1w, T1u));
Chris@10 385 ST(&(Rm[WS(rs, 7)]), T1y, -ms, &(Rm[WS(rs, 1)]));
Chris@10 386 T1J = VADD(T1A, T1I);
Chris@10 387 ST(&(Rp[WS(rs, 6)]), T1J, ms, &(Rp[0]));
Chris@10 388 T1Q = VCONJ(VSUB(T1O, T1M));
Chris@10 389 ST(&(Rm[WS(rs, 2)]), T1Q, -ms, &(Rm[0]));
Chris@10 390 T1q = VCONJ(VSUB(T1o, T1k));
Chris@10 391 ST(&(Rm[WS(rs, 1)]), T1q, -ms, &(Rm[WS(rs, 1)]));
Chris@10 392 T1x = VADD(T1u, T1w);
Chris@10 393 ST(&(Rp[WS(rs, 7)]), T1x, ms, &(Rp[WS(rs, 1)]));
Chris@10 394 T1K = VCONJ(VSUB(T1I, T1A));
Chris@10 395 ST(&(Rm[WS(rs, 6)]), T1K, -ms, &(Rm[0]));
Chris@10 396 T1P = VADD(T1M, T1O);
Chris@10 397 ST(&(Rp[WS(rs, 2)]), T1P, ms, &(Rp[0]));
Chris@10 398 }
Chris@10 399 }
Chris@10 400 }
/* Invoked once after the loop. NOTE(review): presumably SIMD state cleanup - confirm in the SIMD header. */
Chris@10 401 VLEAVE();
Chris@10 402 }
Chris@10 403
/*
 * Twiddle instruction list for the non-FMA variant, referenced from desc:
 * fifteen VTW(1, k) entries for k = 1..15, closed by a {TW_NEXT, VL, 0}
 * record. Identical in content to the list in the HAVE_FMA branch.
 * NOTE(review): the meaning of VTW's arguments and of TW_NEXT is defined
 * outside this file - confirm there.
 */
Chris@10 404 static const tw_instr twinstr[] = {
Chris@10 405 VTW(1, 1),
Chris@10 406 VTW(1, 2),
Chris@10 407 VTW(1, 3),
Chris@10 408 VTW(1, 4),
Chris@10 409 VTW(1, 5),
Chris@10 410 VTW(1, 6),
Chris@10 411 VTW(1, 7),
Chris@10 412 VTW(1, 8),
Chris@10 413 VTW(1, 9),
Chris@10 414 VTW(1, 10),
Chris@10 415 VTW(1, 11),
Chris@10 416 VTW(1, 12),
Chris@10 417 VTW(1, 13),
Chris@10 418 VTW(1, 14),
Chris@10 419 VTW(1, 15),
Chris@10 420 {TW_NEXT, VL, 0}
Chris@10 421 };
Chris@10 422
/*
 * Codelet descriptor for the non-FMA variant: the value 16, the codelet name
 * string, the twiddle list, &GENUS, and the initializer {99, 38, 4, 0}, whose
 * first three numbers equal the "99 additions, 38 multiplications, 4 fused
 * multiply/add" tally in this variant's header comment.
 * NOTE(review): field meanings come from hc2c_desc, declared outside this
 * file - confirm there.
 */
Chris@10 423 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cbdftv_16"), twinstr, &GENUS, {99, 38, 4, 0} };
Chris@10 424
/*
 * Registration entry point (non-FMA variant): passes the static codelet
 * function and its descriptor to X(khc2c_register) for planner p, with the
 * HC2C_VIA_DFT tag. This is the only non-static symbol in this variant.
 */
Chris@10 425 void XSIMD(codelet_hc2cbdftv_16) (planner *p) {
Chris@10 426 X(khc2c_register) (p, hc2cbdftv_16, &desc, HC2C_VIA_DFT);
Chris@10 427 }
Chris@10 428 #endif /* HAVE_FMA */