annotate src/fftw-3.3.8/rdft/simd/common/hc2cbdftv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:08:11 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dif -sign 1 -name hc2cbdftv_8 -include rdft/simd/hc2cbv.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 41 FP additions, 32 FP multiplications,
Chris@82 32 * (or, 23 additions, 14 multiplications, 18 fused multiply/add),
Chris@82 33 * 51 stack variables, 1 constants, and 16 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/simd/hc2cbv.h"
Chris@82 36
/*
 * hc2cbdftv_8, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).  Machine-generated SIMD codelet;
 * the generator command line above gives -n 8 -dif -sign 1.
 *
 * Each loop iteration loads four vectors from Rp and four from Rm, combines
 * them with the add/sub/conjugate/FMA vector macros, multiplies the results
 * by twiddles loaded from W, and stores four vectors back to Rp and four
 * conjugated vectors back to Rm.  Rp and Rm are thus read and overwritten
 * in place.
 *
 * Parameters:
 *   Rp, Rm  in/out arrays; Rp advances by +VL*ms and Rm by -VL*ms per
 *           iteration.
 *   Ip, Im  advanced in the loop header but never dereferenced in this body.
 *   W       twiddle table; offset by (mb - 1) * ((TWVL / VL) * 14) before the
 *           first iteration and advanced by TWVL * 14 per iteration.
 *   rs      stride handed to WS() to index the elements of one transform.
 *   mb, me  the loop runs m = mb, mb + VL, ... while m < me.
 *   ms      stride handed to LD/ST and used to step the data pointers.
 *
 * NOTE(review): the exact semantics of the V* macros (VFMACONJ, VFNMSCONJ,
 * VZMUL, VZMULI, VFMAI, VFNMSI, ...) are defined in the SIMD headers, not
 * here; the inline remarks below describe only the data flow visible in
 * this function.
 */
Chris@82 37 static void hc2cbdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically sqrt(2)/2 */
Chris@82 40 {
Chris@82 41 INT m;
Chris@82 42 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 43 V Tm, Tp, TF, TE, Th, Tv, Tc, Tu, T4, Tk, Tf, Tl, T7, Tn, Ta;
Chris@82 44 V To, T2, T3, Td, Te, T5, T6, T8, T9, Tg, Tb, TL, TK, TJ, TM;
Chris@82 45 V TN, TC, TG, TB, TD, TH, TI, Ti, Tq, T1, Tj, Tr, Ts, Tw, Ty;
Chris@82 46 V Tt, Tx, Tz, TA;
Chris@82 47 T2 = LD(&(Rp[0]), ms, &(Rp[0])); /* input stage: each Rp[k] is paired with Rm[3 - k] */
Chris@82 48 T3 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 49 T4 = VFNMSCONJ(T3, T2); /* presumably T2 - conj(T3) -- confirm against the macro definition */
Chris@82 50 Tk = VFMACONJ(T3, T2); /* presumably T2 + conj(T3) -- confirm against the macro definition */
Chris@82 51 Td = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@82 52 Te = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 53 Tf = VFNMSCONJ(Te, Td);
Chris@82 54 Tl = VFMACONJ(Te, Td);
Chris@82 55 T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 56 T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@82 57 T7 = VFNMSCONJ(T6, T5);
Chris@82 58 Tn = VFMACONJ(T6, T5);
Chris@82 59 T8 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 60 T9 = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@82 61 Ta = VFMSCONJ(T9, T8); /* note: VFMSCONJ here, whereas the other three pairs use VFNMSCONJ */
Chris@82 62 To = VFMACONJ(T9, T8);
Chris@82 63 Tm = VSUB(Tk, Tl); /* combine the four "sum" terms Tk, Tl, Tn, To */
Chris@82 64 Tp = VSUB(Tn, To);
Chris@82 65 TF = VADD(Tn, To);
Chris@82 66 TE = VADD(Tk, Tl);
Chris@82 67 Tg = VSUB(T7, Ta); /* combine the four "difference" terms, scaling by KP707106781 */
Chris@82 68 Th = VFMA(LDK(KP707106781), Tg, Tf);
Chris@82 69 Tv = VFNMS(LDK(KP707106781), Tg, Tf);
Chris@82 70 Tb = VADD(T7, Ta);
Chris@82 71 Tc = VFMA(LDK(KP707106781), Tb, T4);
Chris@82 72 Tu = VFNMS(LDK(KP707106781), Tb, T4);
Chris@82 73 TL = VADD(TE, TF); /* outputs Rp[0] / Rm[0]: twiddle at W[0]; TL is used without a twiddle */
Chris@82 74 TJ = LDW(&(W[0]));
Chris@82 75 TK = VZMULI(TJ, VFMAI(Th, Tc));
Chris@82 76 TM = VADD(TK, TL);
Chris@82 77 ST(&(Rp[0]), TM, ms, &(Rp[0]));
Chris@82 78 TN = VCONJ(VSUB(TL, TK));
Chris@82 79 ST(&(Rm[0]), TN, -ms, &(Rm[0]));
Chris@82 80 TB = LDW(&(W[TWVL * 8])); /* outputs Rp[2] / Rm[2]: twiddles at W[TWVL * 8] and W[TWVL * 6] */
Chris@82 81 TC = VZMULI(TB, VFMAI(Tv, Tu));
Chris@82 82 TD = LDW(&(W[TWVL * 6]));
Chris@82 83 TG = VZMUL(TD, VSUB(TE, TF));
Chris@82 84 TH = VADD(TC, TG);
Chris@82 85 ST(&(Rp[WS(rs, 2)]), TH, ms, &(Rp[0]));
Chris@82 86 TI = VCONJ(VSUB(TG, TC));
Chris@82 87 ST(&(Rm[WS(rs, 2)]), TI, -ms, &(Rm[0]));
Chris@82 88 T1 = LDW(&(W[TWVL * 12])); /* outputs Rp[3] / Rm[3]: twiddles at W[TWVL * 12] and W[TWVL * 10] */
Chris@82 89 Ti = VZMULI(T1, VFNMSI(Th, Tc));
Chris@82 90 Tj = LDW(&(W[TWVL * 10]));
Chris@82 91 Tq = VZMUL(Tj, VFNMSI(Tp, Tm));
Chris@82 92 Tr = VADD(Ti, Tq);
Chris@82 93 ST(&(Rp[WS(rs, 3)]), Tr, ms, &(Rp[WS(rs, 1)]));
Chris@82 94 Ts = VCONJ(VSUB(Tq, Ti));
Chris@82 95 ST(&(Rm[WS(rs, 3)]), Ts, -ms, &(Rm[WS(rs, 1)]));
Chris@82 96 Tt = LDW(&(W[TWVL * 4])); /* outputs Rp[1] / Rm[1]: twiddles at W[TWVL * 4] and W[TWVL * 2] */
Chris@82 97 Tw = VZMULI(Tt, VFNMSI(Tv, Tu));
Chris@82 98 Tx = LDW(&(W[TWVL * 2]));
Chris@82 99 Ty = VZMUL(Tx, VFMAI(Tp, Tm));
Chris@82 100 Tz = VADD(Tw, Ty);
Chris@82 101 ST(&(Rp[WS(rs, 1)]), Tz, ms, &(Rp[WS(rs, 1)]));
Chris@82 102 TA = VCONJ(VSUB(Ty, Tw));
Chris@82 103 ST(&(Rm[WS(rs, 1)]), TA, -ms, &(Rm[WS(rs, 1)]));
Chris@82 104 }
Chris@82 105 }
Chris@82 106 VLEAVE(); /* macro from the SIMD headers; its effect is not visible in this file */
Chris@82 107 }
Chris@82 108
/*
 * Twiddle instruction table for the FMA variant: seven VTW(1, k) entries for
 * k = 1..7, terminated by a {TW_NEXT, VL, 0} record.
 * NOTE(review): seven entries is consistent with the codelet stepping W by
 * TWVL * 14 per iteration (presumably two reals per twiddle) -- confirm
 * against the VTW macro definition.
 */
Chris@82 109 static const tw_instr twinstr[] = {
Chris@82 110 VTW(1, 1),
Chris@82 111 VTW(1, 2),
Chris@82 112 VTW(1, 3),
Chris@82 113 VTW(1, 4),
Chris@82 114 VTW(1, 5),
Chris@82 115 VTW(1, 6),
Chris@82 116 VTW(1, 7),
Chris@82 117 {TW_NEXT, VL, 0}
Chris@82 118 };
Chris@82 119
/*
 * Codelet descriptor for the FMA variant: size 8, the codelet name string,
 * the twiddle table, &GENUS, and the operation counts {23, 14, 18, 0}, which
 * match the "23 additions, 14 multiplications, 18 fused multiply/add" figures
 * in the generated comment at the top of this branch.
 */
Chris@82 120 static const hc2c_desc desc = { 8, XSIMD_STRING("hc2cbdftv_8"), twinstr, &GENUS, {23, 14, 18, 0} };
Chris@82 121
/*
 * Registration entry point (FMA variant): hands the hc2cbdftv_8 codelet and
 * its descriptor to the planner p via khc2c_register, tagged HC2C_VIA_DFT.
 */
Chris@82 122 void XSIMD(codelet_hc2cbdftv_8) (planner *p) {
Chris@82 123 X(khc2c_register) (p, hc2cbdftv_8, &desc, HC2C_VIA_DFT);
Chris@82 124 }
Chris@82 125 #else
Chris@82 126
Chris@82 127 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dif -sign 1 -name hc2cbdftv_8 -include rdft/simd/hc2cbv.h */
Chris@82 128
Chris@82 129 /*
Chris@82 130 * This function contains 41 FP additions, 16 FP multiplications,
Chris@82 131 * (or, 41 additions, 16 multiplications, 0 fused multiply/add),
Chris@82 132 * 55 stack variables, 1 constants, and 16 memory accesses
Chris@82 133 */
Chris@82 134 #include "rdft/simd/hc2cbv.h"
Chris@82 135
/*
 * hc2cbdftv_8, non-FMA variant (compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined).  Machine-generated SIMD codelet;
 * the generator command line above gives -n 8 -dif -sign 1.
 *
 * Each loop iteration loads four vectors from Rp and four from Rm,
 * conjugates the Rm values, forms sums and differences, multiplies the
 * results by twiddles loaded from W, and stores four vectors back to Rp and
 * four conjugated vectors back to Rm.  Rp and Rm are thus read and
 * overwritten in place.
 *
 * Parameters:
 *   Rp, Rm  in/out arrays; Rp advances by +VL*ms and Rm by -VL*ms per
 *           iteration.
 *   Ip, Im  advanced in the loop header but never dereferenced in this body.
 *   W       twiddle table; offset by (mb - 1) * ((TWVL / VL) * 14) before the
 *           first iteration and advanced by TWVL * 14 per iteration.
 *   rs      stride handed to WS() to index the elements of one transform.
 *   mb, me  the loop runs m = mb, mb + VL, ... while m < me.
 *   ms      stride handed to LD/ST and used to step the data pointers.
 *
 * NOTE(review): the exact semantics of VCONJ, VBYI, VZMUL and VZMULI are
 * defined in the SIMD headers, not here; the inline remarks below describe
 * only the data flow visible in this function.
 */
Chris@82 136 static void hc2cbdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 137 {
Chris@82 138 DVK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically sqrt(2)/2 */
Chris@82 139 {
Chris@82 140 INT m;
Chris@82 141 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 142 V T5, Tj, Tq, TI, Te, Tk, Tt, TJ, T2, Tg, T4, Ti, T3, Th, To;
Chris@82 143 V Tp, T6, Tc, T8, Tb, T7, Ta, T9, Td, Tr, Ts, TP, Tu, Tm, TO;
Chris@82 144 V Tn, Tf, Tl, T1, TN, Tv, TR, Tw, TQ, TC, TK, TA, TG, TB, TH;
Chris@82 145 V Ty, Tz, Tx, TF, TD, TM, TE, TL;
Chris@82 146 T2 = LD(&(Rp[0]), ms, &(Rp[0])); /* even-index inputs: Rp[0], Rp[2] paired with Rm[3], Rm[1] */
Chris@82 147 Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@82 148 T3 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 149 T4 = VCONJ(T3);
Chris@82 150 Th = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 151 Ti = VCONJ(Th);
Chris@82 152 T5 = VSUB(T2, T4);
Chris@82 153 Tj = VSUB(Tg, Ti);
Chris@82 154 To = VADD(T2, T4);
Chris@82 155 Tp = VADD(Tg, Ti);
Chris@82 156 Tq = VSUB(To, Tp);
Chris@82 157 TI = VADD(To, Tp);
Chris@82 158 T6 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); /* odd-index inputs: Rp[1], Rp[3] paired with Rm[2], Rm[0] */
Chris@82 159 Tc = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 160 T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@82 161 T8 = VCONJ(T7);
Chris@82 162 Ta = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@82 163 Tb = VCONJ(Ta);
Chris@82 164 T9 = VSUB(T6, T8);
Chris@82 165 Td = VSUB(Tb, Tc); /* note the operand order: conjugated Rm value minus Rp value, unlike T5, Tj and T9 */
Chris@82 166 Te = VMUL(LDK(KP707106781), VADD(T9, Td));
Chris@82 167 Tk = VMUL(LDK(KP707106781), VSUB(T9, Td));
Chris@82 168 Tr = VADD(T6, T8);
Chris@82 169 Ts = VADD(Tb, Tc);
Chris@82 170 Tt = VBYI(VSUB(Tr, Ts)); /* VBYI presumably multiplies by i -- confirm against the macro definition */
Chris@82 171 TJ = VADD(Tr, Ts);
Chris@82 172 TP = VADD(TI, TJ); /* TP is used for Rp[0] / Rm[0] without a twiddle */
Chris@82 173 Tn = LDW(&(W[TWVL * 10])); /* outputs Rp[3] / Rm[3] use W[TWVL * 12] and W[TWVL * 10]; Rp[0] / Rm[0] use W[0] */
Chris@82 174 Tu = VZMUL(Tn, VSUB(Tq, Tt));
Chris@82 175 Tf = VADD(T5, Te);
Chris@82 176 Tl = VBYI(VADD(Tj, Tk));
Chris@82 177 T1 = LDW(&(W[TWVL * 12]));
Chris@82 178 Tm = VZMULI(T1, VSUB(Tf, Tl));
Chris@82 179 TN = LDW(&(W[0]));
Chris@82 180 TO = VZMULI(TN, VADD(Tl, Tf));
Chris@82 181 Tv = VADD(Tm, Tu);
Chris@82 182 ST(&(Rp[WS(rs, 3)]), Tv, ms, &(Rp[WS(rs, 1)]));
Chris@82 183 TR = VCONJ(VSUB(TP, TO));
Chris@82 184 ST(&(Rm[0]), TR, -ms, &(Rm[0]));
Chris@82 185 Tw = VCONJ(VSUB(Tu, Tm));
Chris@82 186 ST(&(Rm[WS(rs, 3)]), Tw, -ms, &(Rm[WS(rs, 1)]));
Chris@82 187 TQ = VADD(TO, TP);
Chris@82 188 ST(&(Rp[0]), TQ, ms, &(Rp[0]));
Chris@82 189 TB = LDW(&(W[TWVL * 2])); /* outputs Rp[1] / Rm[1] use W[TWVL * 4] and W[TWVL * 2]; Rp[2] / Rm[2] use W[TWVL * 8] and W[TWVL * 6] */
Chris@82 190 TC = VZMUL(TB, VADD(Tq, Tt));
Chris@82 191 TH = LDW(&(W[TWVL * 6]));
Chris@82 192 TK = VZMUL(TH, VSUB(TI, TJ));
Chris@82 193 Ty = VBYI(VSUB(Tk, Tj));
Chris@82 194 Tz = VSUB(T5, Te);
Chris@82 195 Tx = LDW(&(W[TWVL * 4]));
Chris@82 196 TA = VZMULI(Tx, VADD(Ty, Tz));
Chris@82 197 TF = LDW(&(W[TWVL * 8]));
Chris@82 198 TG = VZMULI(TF, VSUB(Tz, Ty));
Chris@82 199 TD = VADD(TA, TC);
Chris@82 200 ST(&(Rp[WS(rs, 1)]), TD, ms, &(Rp[WS(rs, 1)]));
Chris@82 201 TM = VCONJ(VSUB(TK, TG));
Chris@82 202 ST(&(Rm[WS(rs, 2)]), TM, -ms, &(Rm[0]));
Chris@82 203 TE = VCONJ(VSUB(TC, TA));
Chris@82 204 ST(&(Rm[WS(rs, 1)]), TE, -ms, &(Rm[WS(rs, 1)]));
Chris@82 205 TL = VADD(TG, TK);
Chris@82 206 ST(&(Rp[WS(rs, 2)]), TL, ms, &(Rp[0]));
Chris@82 207 }
Chris@82 208 }
Chris@82 209 VLEAVE(); /* macro from the SIMD headers; its effect is not visible in this file */
Chris@82 210 }
Chris@82 211
/*
 * Twiddle instruction table for the non-FMA variant: seven VTW(1, k) entries
 * for k = 1..7, terminated by a {TW_NEXT, VL, 0} record.
 * NOTE(review): seven entries is consistent with the codelet stepping W by
 * TWVL * 14 per iteration (presumably two reals per twiddle) -- confirm
 * against the VTW macro definition.
 */
Chris@82 212 static const tw_instr twinstr[] = {
Chris@82 213 VTW(1, 1),
Chris@82 214 VTW(1, 2),
Chris@82 215 VTW(1, 3),
Chris@82 216 VTW(1, 4),
Chris@82 217 VTW(1, 5),
Chris@82 218 VTW(1, 6),
Chris@82 219 VTW(1, 7),
Chris@82 220 {TW_NEXT, VL, 0}
Chris@82 221 };
Chris@82 222
/*
 * Codelet descriptor for the non-FMA variant: size 8, the codelet name
 * string, the twiddle table, &GENUS, and the operation counts {41, 16, 0, 0},
 * which match the "41 additions, 16 multiplications, 0 fused multiply/add"
 * figures in the generated comment at the top of this branch.
 */
Chris@82 223 static const hc2c_desc desc = { 8, XSIMD_STRING("hc2cbdftv_8"), twinstr, &GENUS, {41, 16, 0, 0} };
Chris@82 224
/*
 * Registration entry point (non-FMA variant): hands the hc2cbdftv_8 codelet
 * and its descriptor to the planner p via khc2c_register, tagged
 * HC2C_VIA_DFT.
 */
Chris@82 225 void XSIMD(codelet_hc2cbdftv_8) (planner *p) {
Chris@82 226 X(khc2c_register) (p, hc2cbdftv_8, &desc, HC2C_VIA_DFT);
Chris@82 227 }
Chris@82 228 #endif