annotate src/fftw-3.3.8/rdft/simd/common/hc2cfdftv_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:08:11 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dit -name hc2cfdftv_10 -include rdft/simd/hc2cfv.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 61 FP additions, 60 FP multiplications,
Chris@82 32 * (or, 33 additions, 32 multiplications, 28 fused multiply/add),
Chris@82 33 * 77 stack variables, 5 constants, and 20 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/simd/hc2cfv.h"
Chris@82 36
/*
 * hc2cfdftv_10 (FMA variant): genfft-generated SIMD codelet, compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Each loop iteration performs 10 vector loads (Rp[WS(rs, 0..4)] and
 * Rm[WS(rs, 0..4)]), 9 twiddle loads from W, and 10 vector stores back to
 * the same Rp/Rm locations, so the data is transformed in place.
 *
 * Parameters:
 *   Rp, Rm - data pointers, both read and written.  Rp advances by VL * ms
 *            per iteration; Rm moves backwards by the same amount.
 *   Ip, Im - stepped in lockstep with Rp/Rm but never dereferenced in this
 *            body.
 *   W      - twiddle table.  It is first offset by
 *            (mb - 1) * ((TWVL / VL) * 18) and then advanced by TWVL * 18
 *            per iteration.
 *   rs     - stride used (through WS(rs, k)) to reach the five elements
 *            accessed via each of Rp and Rm.
 *   mb, me - half-open range [mb, me) of the loop counter m, stepped by VL.
 *   ms     - per-iteration stride; also passed (as ms / -ms) to LD and ST.
 */
Chris@82 37 static void hc2cfdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Constants: KP559016994 ~= sqrt(5)/4, KP618033988 ~= (sqrt(5)-1)/2,
 * KP951056516 ~= sin(2*pi/5); the other two are exactly 1/4 and 1/2. */
Chris@82 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 40 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 42 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 43 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 44 {
Chris@82 45 INT m;
/* One iteration handles VL values of m.  MAKE_VOLATILE_STRIDE is defined
 * elsewhere; NOTE(review): presumably a compiler workaround - confirm. */
Chris@82 46 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@82 47 V T8, T11, T12, TG, TH, TP, Tp, TA, TB, TS, TV, TW, TC, TX, TI;
Chris@82 48 V TM, TF, TL, TD, TE, TJ, TO, TK, TN, T13, T17, T10, T16, TY, TZ;
Chris@82 49 V T14, T19, T15, T18;
Chris@82 50 {
Chris@82 51 V T3, To, TU, Th, TT, TR, Tz, Tu, TQ, T7, T1, T2, Tw, T5, T6;
Chris@82 52 V Tr, Tc, Tj, Tg, Ty, Tn, Tt, Tv, Tq, Ta, Tb, T9, Ti, Te, Tf;
Chris@82 53 V Td, Tx, Tl, Tm, Tk, Ts, T4;
/* Input stage: every LD/LDW happens inside this inner block, before any
 * ST.  Each Rp/Rm pair is combined with VFMACONJ / VFNMSCONJ and, except
 * for T3, multiplied by a twiddle loaded from W at an even multiple of
 * TWVL (0, 2, ..., 16). */
Chris@82 54 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@82 55 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@82 56 Tv = LDW(&(W[0]));
Chris@82 57 Tw = VZMULIJ(Tv, VFNMSCONJ(T2, T1));
Chris@82 58 T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@82 59 T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@82 60 Tq = LDW(&(W[TWVL * 6]));
Chris@82 61 Tr = VZMULJ(Tq, VFMACONJ(T6, T5));
Chris@82 62 Ta = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 63 Tb = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 64 T9 = LDW(&(W[TWVL * 2]));
Chris@82 65 Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
Chris@82 66 Ti = LDW(&(W[TWVL * 4]));
Chris@82 67 Tj = VZMULIJ(Ti, VFNMSCONJ(Tb, Ta));
Chris@82 68 Te = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 69 Tf = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 70 Td = LDW(&(W[TWVL * 12]));
Chris@82 71 Tg = VZMULIJ(Td, VFNMSCONJ(Tf, Te));
Chris@82 72 Tx = LDW(&(W[TWVL * 10]));
Chris@82 73 Ty = VZMULJ(Tx, VFMACONJ(Tf, Te));
Chris@82 74 Tl = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@82 75 Tm = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@82 76 Tk = LDW(&(W[TWVL * 14]));
Chris@82 77 Tn = VZMULJ(Tk, VFMACONJ(Tm, Tl));
Chris@82 78 Ts = LDW(&(W[TWVL * 16]));
Chris@82 79 Tt = VZMULIJ(Ts, VFNMSCONJ(Tm, Tl));
/* Sums and differences of the twiddled terms.  Only the values declared
 * in the outer scope (T8, T11, T12, TG, TH, TP, Tp, TA, TB, TS, TV, TW)
 * survive past the closing brace of this block. */
Chris@82 80 T3 = VFMACONJ(T2, T1);
Chris@82 81 To = VSUB(Tj, Tn);
Chris@82 82 TU = VADD(Tr, Tt);
Chris@82 83 Th = VSUB(Tc, Tg);
Chris@82 84 TT = VADD(Tw, Ty);
Chris@82 85 TR = VADD(Tj, Tn);
Chris@82 86 Tz = VSUB(Tw, Ty);
Chris@82 87 Tu = VSUB(Tr, Tt);
Chris@82 88 TQ = VADD(Tc, Tg);
Chris@82 89 T4 = LDW(&(W[TWVL * 8]));
Chris@82 90 T7 = VZMULIJ(T4, VFNMSCONJ(T6, T5));
Chris@82 91 T8 = VSUB(T3, T7);
Chris@82 92 T11 = VSUB(TQ, TR);
Chris@82 93 T12 = VSUB(TU, TT);
Chris@82 94 TG = VADD(Tz, Tu);
Chris@82 95 TH = VADD(Th, To);
Chris@82 96 TP = VADD(T3, T7);
Chris@82 97 Tp = VSUB(Th, To);
Chris@82 98 TA = VSUB(Tu, Tz);
Chris@82 99 TB = VADD(Tp, TA);
Chris@82 100 TS = VADD(TQ, TR);
Chris@82 101 TV = VADD(TT, TU);
Chris@82 102 TW = VADD(TS, TV);
Chris@82 103 }
/* Output stage, first half (built from T8, TB, Tp, TA, TG, TH, plus the
 * TP/TW sum stored to Rm[WS(rs, 4)]).  Every stored value is scaled by
 * KP500000000; values written through Rm are additionally wrapped in
 * VCONJ, values written through Rp are not. */
Chris@82 104 TC = VMUL(LDK(KP500000000), VADD(T8, TB));
Chris@82 105 ST(&(Rp[0]), TC, ms, &(Rp[0]));
Chris@82 106 TX = VCONJ(VMUL(LDK(KP500000000), VADD(TP, TW)));
Chris@82 107 ST(&(Rm[WS(rs, 4)]), TX, -ms, &(Rm[0]));
Chris@82 108 TI = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TH, TG));
Chris@82 109 TM = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TG, TH));
Chris@82 110 TD = VFNMS(LDK(KP250000000), TB, T8);
Chris@82 111 TE = VSUB(Tp, TA);
Chris@82 112 TF = VFNMS(LDK(KP559016994), TE, TD);
Chris@82 113 TL = VFMA(LDK(KP559016994), TE, TD);
Chris@82 114 TJ = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TI, TF)));
Chris@82 115 ST(&(Rm[WS(rs, 1)]), TJ, -ms, &(Rm[WS(rs, 1)]));
Chris@82 116 TO = VMUL(LDK(KP500000000), VFMAI(TM, TL));
Chris@82 117 ST(&(Rp[WS(rs, 4)]), TO, ms, &(Rp[0]));
Chris@82 118 TK = VMUL(LDK(KP500000000), VFMAI(TI, TF));
Chris@82 119 ST(&(Rp[WS(rs, 2)]), TK, ms, &(Rp[0]));
Chris@82 120 TN = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TM, TL)));
Chris@82 121 ST(&(Rm[WS(rs, 3)]), TN, -ms, &(Rm[WS(rs, 1)]));
/* Output stage, second half (built from TP, TW, TS, TV, T11, T12); same
 * structure as above with the roles of VFMA / VFNMS swapped. */
Chris@82 122 T13 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T12, T11));
Chris@82 123 T17 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T11, T12));
Chris@82 124 TY = VFNMS(LDK(KP250000000), TW, TP);
Chris@82 125 TZ = VSUB(TS, TV);
Chris@82 126 T10 = VFMA(LDK(KP559016994), TZ, TY);
Chris@82 127 T16 = VFNMS(LDK(KP559016994), TZ, TY);
Chris@82 128 T14 = VMUL(LDK(KP500000000), VFNMSI(T13, T10));
Chris@82 129 ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)]));
Chris@82 130 T19 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T17, T16)));
Chris@82 131 ST(&(Rm[WS(rs, 2)]), T19, -ms, &(Rm[0]));
Chris@82 132 T15 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T13, T10)));
Chris@82 133 ST(&(Rm[0]), T15, -ms, &(Rm[0]));
Chris@82 134 T18 = VMUL(LDK(KP500000000), VFNMSI(T17, T16));
Chris@82 135 ST(&(Rp[WS(rs, 3)]), T18, ms, &(Rp[WS(rs, 1)]));
Chris@82 136 }
Chris@82 137 }
/* VLEAVE is defined elsewhere; NOTE(review): presumably SIMD-state cleanup
 * on exit - confirm against the SIMD header in use. */
Chris@82 138 VLEAVE();
Chris@82 139 }
Chris@82 140
/*
 * Twiddle instruction list for the FMA variant: nine VTW(1, k) entries for
 * k = 1..9, terminated by a TW_NEXT record carrying VL.  The nine entries
 * correspond in number to the nine LDW twiddle loads in hc2cfdftv_10.
 */
Chris@82 141 static const tw_instr twinstr[] = {
Chris@82 142 VTW(1, 1),
Chris@82 143 VTW(1, 2),
Chris@82 144 VTW(1, 3),
Chris@82 145 VTW(1, 4),
Chris@82 146 VTW(1, 5),
Chris@82 147 VTW(1, 6),
Chris@82 148 VTW(1, 7),
Chris@82 149 VTW(1, 8),
Chris@82 150 VTW(1, 9),
Chris@82 151 {TW_NEXT, VL, 0}
Chris@82 152 };
Chris@82 153
/*
 * Codelet descriptor for the FMA variant.  It bundles the value 10
 * (NOTE(review): presumably the radix, matching "-n 10" - confirm against
 * the hc2c_desc declaration), the codelet name string, the twiddle list and
 * GENUS.  The trailing {33, 32, 28, 0} matches the addition /
 * multiplication / fused multiply-add counts quoted in the generator's
 * header comment for this variant.
 */
Chris@82 154 static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cfdftv_10"), twinstr, &GENUS, {33, 32, 28, 0} };
Chris@82 155
/*
 * Registration entry point (FMA variant): hands the hc2cfdftv_10 codelet and
 * its descriptor to the planner p via khc2c_register, tagged HC2C_VIA_DFT.
 */
Chris@82 156 void XSIMD(codelet_hc2cfdftv_10) (planner *p) {
Chris@82 157 X(khc2c_register) (p, hc2cfdftv_10, &desc, HC2C_VIA_DFT);
Chris@82 158 }
Chris@82 159 #else
Chris@82 160
Chris@82 161 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dit -name hc2cfdftv_10 -include rdft/simd/hc2cfv.h */
Chris@82 162
Chris@82 163 /*
Chris@82 164 * This function contains 61 FP additions, 38 FP multiplications,
Chris@82 165 * (or, 55 additions, 32 multiplications, 6 fused multiply/add),
Chris@82 166 * 82 stack variables, 5 constants, and 20 memory accesses
Chris@82 167 */
Chris@82 168 #include "rdft/simd/hc2cfv.h"
Chris@82 169
/*
 * hc2cfdftv_10 (non-FMA variant): genfft-generated SIMD codelet, compiled
 * when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 * Signature and memory-access pattern are the same as the FMA variant.
 *
 * Each loop iteration performs 10 vector loads (Rp[WS(rs, 0..4)] and
 * Rm[WS(rs, 0..4)]), 9 twiddle loads from W, and 10 vector stores back to
 * the same Rp/Rm locations, so the data is transformed in place.
 *
 * Parameters:
 *   Rp, Rm - data pointers, both read and written.  Rp advances by VL * ms
 *            per iteration; Rm moves backwards by the same amount.
 *   Ip, Im - stepped in lockstep with Rp/Rm but never dereferenced in this
 *            body.
 *   W      - twiddle table.  It is first offset by
 *            (mb - 1) * ((TWVL / VL) * 18) and then advanced by TWVL * 18
 *            per iteration.
 *   rs     - stride used (through WS(rs, k)) to reach the five elements
 *            accessed via each of Rp and Rm.
 *   mb, me - half-open range [mb, me) of the loop counter m, stepped by VL.
 *   ms     - per-iteration stride; also passed (as ms / -ms) to LD and ST.
 */
Chris@82 170 static void hc2cfdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 171 {
/* Constants: KP279508497 ~= sqrt(5)/8, KP587785252 ~= sin(pi/5),
 * KP951056516 ~= sin(2*pi/5); the other two are exactly 1/8 and 1/2. */
Chris@82 172 DVK(KP125000000, +0.125000000000000000000000000000000000000000000);
Chris@82 173 DVK(KP279508497, +0.279508497187473712051146708591409529430077295);
Chris@82 174 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 175 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 176 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 177 {
Chris@82 178 INT m;
/* One iteration handles VL values of m.  MAKE_VOLATILE_STRIDE is defined
 * elsewhere; NOTE(review): presumably a compiler workaround - confirm. */
Chris@82 179 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@82 180 V Tl, Tt, Tu, TY, TZ, T10, Tz, TE, TF, TV, TW, TX, Ta, TU, TN;
Chris@82 181 V TR, TH, TQ, TK, TL, TM, TI, TG, TJ, TT, TO, TP, TS, T18, T1c;
Chris@82 182 V T12, T1b, T15, T16, T17, T14, T11, T13, T1e, T19, T1a, T1d;
Chris@82 183 {
Chris@82 184 V T1, T3, Ty, T8, T7, TB, Tf, Ts, Tk, Tw, Tq, TD, T2, Tx, T6;
Chris@82 185 V TA, Tc, Te, Td, Tb, Tr, Tj, Ti, Th, Tg, Tv, Tn, Tp, To, Tm;
Chris@82 186 V TC, T4, T9, T5;
/* Input stage: every LD/LDW happens inside this inner block, before any
 * ST.  Each value loaded through Rm is conjugated with VCONJ, then added
 * to or subtracted from its Rp partner and (except for T4) multiplied by
 * a twiddle loaded from W at an even multiple of TWVL (0, 2, ..., 16). */
Chris@82 187 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@82 188 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@82 189 T3 = VCONJ(T2);
Chris@82 190 Tx = LDW(&(W[0]));
Chris@82 191 Ty = VZMULIJ(Tx, VSUB(T3, T1));
Chris@82 192 T8 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@82 193 T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@82 194 T7 = VCONJ(T6);
Chris@82 195 TA = LDW(&(W[TWVL * 6]));
Chris@82 196 TB = VZMULJ(TA, VADD(T7, T8));
Chris@82 197 Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 198 Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 199 Te = VCONJ(Td);
Chris@82 200 Tb = LDW(&(W[TWVL * 2]));
Chris@82 201 Tf = VZMULJ(Tb, VADD(Tc, Te));
Chris@82 202 Tr = LDW(&(W[TWVL * 4]));
Chris@82 203 Ts = VZMULIJ(Tr, VSUB(Te, Tc));
Chris@82 204 Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@82 205 Th = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@82 206 Ti = VCONJ(Th);
Chris@82 207 Tg = LDW(&(W[TWVL * 12]));
Chris@82 208 Tk = VZMULIJ(Tg, VSUB(Ti, Tj));
Chris@82 209 Tv = LDW(&(W[TWVL * 10]));
Chris@82 210 Tw = VZMULJ(Tv, VADD(Ti, Tj));
Chris@82 211 Tn = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@82 212 To = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@82 213 Tp = VCONJ(To);
Chris@82 214 Tm = LDW(&(W[TWVL * 14]));
Chris@82 215 Tq = VZMULJ(Tm, VADD(Tn, Tp));
Chris@82 216 TC = LDW(&(W[TWVL * 16]));
Chris@82 217 TD = VZMULIJ(TC, VSUB(Tp, Tn));
/* Sums and differences of the twiddled terms.  Only the values declared
 * in the outer scope (Tl, Tt, Tu, TY, TZ, T10, Tz, TE, TF, TV, TW, TX,
 * Ta, TU) survive past the closing brace of this block. */
Chris@82 218 Tl = VSUB(Tf, Tk);
Chris@82 219 Tt = VSUB(Tq, Ts);
Chris@82 220 Tu = VADD(Tl, Tt);
Chris@82 221 TY = VADD(Ty, Tw);
Chris@82 222 TZ = VADD(TB, TD);
Chris@82 223 T10 = VADD(TY, TZ);
Chris@82 224 Tz = VSUB(Tw, Ty);
Chris@82 225 TE = VSUB(TB, TD);
Chris@82 226 TF = VADD(Tz, TE);
Chris@82 227 TV = VADD(Tf, Tk);
Chris@82 228 TW = VADD(Ts, Tq);
Chris@82 229 TX = VADD(TV, TW);
Chris@82 230 T4 = VADD(T1, T3);
Chris@82 231 T5 = LDW(&(W[TWVL * 8]));
Chris@82 232 T9 = VZMULIJ(T5, VSUB(T7, T8));
Chris@82 233 Ta = VSUB(T4, T9);
Chris@82 234 TU = VADD(T4, T9);
Chris@82 235 }
/* Output stage, first half (built from Ta, Tu, TF, Tl, Tt, TE, Tz).  The
 * 0.5 scaling is folded into the intermediate terms TN, TR, TJ and TH
 * (TI uses KP279508497, numerically 0.5 * sqrt(5)/4).  Values written
 * through Rm are wrapped in VCONJ, values written through Rp are not. */
Chris@82 236 TL = VSUB(Tl, Tt);
Chris@82 237 TM = VSUB(TE, Tz);
Chris@82 238 TN = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TM))));
Chris@82 239 TR = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), TL, VMUL(LDK(KP951056516), TM))));
Chris@82 240 TI = VMUL(LDK(KP279508497), VSUB(Tu, TF));
Chris@82 241 TG = VADD(Tu, TF);
Chris@82 242 TJ = VFNMS(LDK(KP125000000), TG, VMUL(LDK(KP500000000), Ta));
Chris@82 243 TH = VCONJ(VMUL(LDK(KP500000000), VADD(Ta, TG)));
Chris@82 244 TQ = VSUB(TJ, TI);
Chris@82 245 TK = VADD(TI, TJ);
Chris@82 246 ST(&(Rm[WS(rs, 4)]), TH, -ms, &(Rm[0]));
Chris@82 247 TT = VCONJ(VADD(TQ, TR));
Chris@82 248 ST(&(Rm[WS(rs, 2)]), TT, -ms, &(Rm[0]));
Chris@82 249 TO = VSUB(TK, TN);
Chris@82 250 ST(&(Rp[WS(rs, 1)]), TO, ms, &(Rp[WS(rs, 1)]));
Chris@82 251 TP = VCONJ(VADD(TK, TN));
Chris@82 252 ST(&(Rm[0]), TP, -ms, &(Rm[0]));
Chris@82 253 TS = VSUB(TQ, TR);
Chris@82 254 ST(&(Rp[WS(rs, 3)]), TS, ms, &(Rp[WS(rs, 1)]));
/* Output stage, second half (built from TU, TX, T10, TV, TW, TY, TZ);
 * same structure as the first half. */
Chris@82 255 T16 = VSUB(TZ, TY);
Chris@82 256 T17 = VSUB(TV, TW);
Chris@82 257 T18 = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T17, VMUL(LDK(KP951056516), T16))));
Chris@82 258 T1c = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T17, VMUL(LDK(KP587785252), T16))));
Chris@82 259 T14 = VMUL(LDK(KP279508497), VSUB(TX, T10));
Chris@82 260 T11 = VADD(TX, T10);
Chris@82 261 T13 = VFNMS(LDK(KP125000000), T11, VMUL(LDK(KP500000000), TU));
Chris@82 262 T12 = VMUL(LDK(KP500000000), VADD(TU, T11));
Chris@82 263 T1b = VADD(T14, T13);
Chris@82 264 T15 = VSUB(T13, T14);
Chris@82 265 ST(&(Rp[0]), T12, ms, &(Rp[0]));
Chris@82 266 T1e = VADD(T1b, T1c);
Chris@82 267 ST(&(Rp[WS(rs, 4)]), T1e, ms, &(Rp[0]));
Chris@82 268 T19 = VCONJ(VSUB(T15, T18));
Chris@82 269 ST(&(Rm[WS(rs, 1)]), T19, -ms, &(Rm[WS(rs, 1)]));
Chris@82 270 T1a = VADD(T15, T18);
Chris@82 271 ST(&(Rp[WS(rs, 2)]), T1a, ms, &(Rp[0]));
Chris@82 272 T1d = VCONJ(VSUB(T1b, T1c));
Chris@82 273 ST(&(Rm[WS(rs, 3)]), T1d, -ms, &(Rm[WS(rs, 1)]));
Chris@82 274 }
Chris@82 275 }
/* VLEAVE is defined elsewhere; NOTE(review): presumably SIMD-state cleanup
 * on exit - confirm against the SIMD header in use. */
Chris@82 276 VLEAVE();
Chris@82 277 }
Chris@82 278
/*
 * Twiddle instruction list for the non-FMA variant: nine VTW(1, k) entries
 * for k = 1..9, terminated by a TW_NEXT record carrying VL.  The nine
 * entries correspond in number to the nine LDW twiddle loads in
 * hc2cfdftv_10.
 */
Chris@82 279 static const tw_instr twinstr[] = {
Chris@82 280 VTW(1, 1),
Chris@82 281 VTW(1, 2),
Chris@82 282 VTW(1, 3),
Chris@82 283 VTW(1, 4),
Chris@82 284 VTW(1, 5),
Chris@82 285 VTW(1, 6),
Chris@82 286 VTW(1, 7),
Chris@82 287 VTW(1, 8),
Chris@82 288 VTW(1, 9),
Chris@82 289 {TW_NEXT, VL, 0}
Chris@82 290 };
Chris@82 291
/*
 * Codelet descriptor for the non-FMA variant.  It bundles the value 10
 * (NOTE(review): presumably the radix, matching "-n 10" - confirm against
 * the hc2c_desc declaration), the codelet name string, the twiddle list and
 * GENUS.  The trailing {55, 32, 6, 0} matches the addition /
 * multiplication / fused multiply-add counts quoted in the generator's
 * header comment for this variant.
 */
Chris@82 292 static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cfdftv_10"), twinstr, &GENUS, {55, 32, 6, 0} };
Chris@82 293
/*
 * Registration entry point (non-FMA variant): hands the hc2cfdftv_10 codelet
 * and its descriptor to the planner p via khc2c_register, tagged
 * HC2C_VIA_DFT.
 */
Chris@82 294 void XSIMD(codelet_hc2cfdftv_10) (planner *p) {
Chris@82 295 X(khc2c_register) (p, hc2cfdftv_10, &desc, HC2C_VIA_DFT);
Chris@82 296 }
Chris@82 297 #endif