annotate src/fftw-3.3.5/rdft/simd/common/hc2cfdftv_6.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:52:40 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 6 -dit -name hc2cfdftv_6 -include hc2cfv.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 29 FP additions, 30 FP multiplications,
Chris@42 32 * (or, 17 additions, 18 multiplications, 12 fused multiply/add),
Chris@42 33 * 38 stack variables, 2 constants, and 12 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cfv.h"
Chris@42 36
/*
 * hc2cfdftv_6, HAVE_FMA variant: machine-generated SIMD codelet (the
 * "Generated by" comment in this file shows it was produced with
 * "-n 6 -dit" and "-fma").
 *
 * Per loop iteration the body:
 *   - loads three vectors from Rp (rows 0, 1, 2 via WS(rs, k)) and three
 *     from Rm;
 *   - forms a VFMACONJ and a VFNMSCONJ combination of each Rp/Rm pair,
 *     five of the six being multiplied by a twiddle vector read from W
 *     (offsets TWVL * 0, 2, 4, 6, 8) through VZMULJ / VZMULIJ;
 *   - combines the results with sums/differences scaled by the constants
 *     0.5 and 0.866025403... (numerically sqrt(3)/2);
 *   - stores six vectors back to the same Rp/Rm locations it read from.
 *
 * Parameters, as used by the visible code:
 *   Rp, Rm  data pointers, both read and written; Rp is advanced by
 *           VL * ms and Rm is moved back by VL * ms after each iteration.
 *   Ip, Im  only stepped in the loop header; never dereferenced here.
 *           NOTE(review): presumably the vector loads through Rp/Rm also
 *           cover the imaginary data -- confirm against LD/ST in the
 *           SIMD headers.
 *   W       twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 10)
 *           and advanced by TWVL * 10 per iteration.
 *   rs      stride passed to WS(rs, k) to address rows 1 and 2.
 *   mb, me  the loop runs m = mb while m < me, in steps of VL.
 *   ms      stride between the data handled by successive iterations.
 *
 * The statement order below is the generator's schedule; do not reorder
 * by hand.
 */
Chris@42 37 static void hc2cfdftv_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 40 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 41 {
Chris@42 42 INT m;
Chris@42 43 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 10)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(24, rs)) {
Chris@42 44 V T5, T6, T3, Tj, T4, T9, Te, Th, T1, T2, Ti, Tc, Td, Tb, Tg;
Chris@42 45 V T7, Ta, Tt, Tk, Tr, T8, Ts, Tf, Tx, Tu, To, Tl, Tw, Tv, Tn;
Chris@42 46 V Tm, Tz, Ty, Tp, Tq;
/* Input and twiddle loads, interleaved with the first Rp/Rm combinations. */
Chris@42 47 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@42 48 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@42 49 Ti = LDW(&(W[0]));
Chris@42 50 Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@42 51 Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@42 52 Tb = LDW(&(W[TWVL * 8]));
Chris@42 53 Tg = LDW(&(W[TWVL * 6]));
Chris@42 54 T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 55 T6 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 56 T3 = VFMACONJ(T2, T1);
Chris@42 57 Tj = VZMULIJ(Ti, VFNMSCONJ(T2, T1));
Chris@42 58 T4 = LDW(&(W[TWVL * 4]));
Chris@42 59 T9 = LDW(&(W[TWVL * 2]));
Chris@42 60 Te = VZMULIJ(Tb, VFNMSCONJ(Td, Tc));
Chris@42 61 Th = VZMULJ(Tg, VFMACONJ(Td, Tc));
Chris@42 62 T7 = VZMULIJ(T4, VFNMSCONJ(T6, T5));
Chris@42 63 Ta = VZMULJ(T9, VFMACONJ(T6, T5));
/* Sums and differences of the six combined terms. */
Chris@42 64 Tt = VADD(Tj, Th);
Chris@42 65 Tk = VSUB(Th, Tj);
Chris@42 66 Tr = VADD(T3, T7);
Chris@42 67 T8 = VSUB(T3, T7);
Chris@42 68 Ts = VADD(Ta, Te);
Chris@42 69 Tf = VSUB(Ta, Te);
/* Scale by 0.866025403... and 0.5, then form and store the six outputs. */
Chris@42 70 Tx = VMUL(LDK(KP866025403), VSUB(Tt, Ts));
Chris@42 71 Tu = VADD(Ts, Tt);
Chris@42 72 To = VMUL(LDK(KP866025403), VSUB(Tk, Tf));
Chris@42 73 Tl = VADD(Tf, Tk);
Chris@42 74 Tw = VFNMS(LDK(KP500000000), Tu, Tr);
Chris@42 75 Tv = VCONJ(VMUL(LDK(KP500000000), VADD(Tr, Tu)));
Chris@42 76 Tn = VFNMS(LDK(KP500000000), Tl, T8);
Chris@42 77 Tm = VMUL(LDK(KP500000000), VADD(T8, Tl));
Chris@42 78 Tz = VMUL(LDK(KP500000000), VFMAI(Tx, Tw));
Chris@42 79 Ty = VCONJ(VMUL(LDK(KP500000000), VFNMSI(Tx, Tw)));
Chris@42 80 ST(&(Rm[WS(rs, 2)]), Tv, -ms, &(Rm[0]));
Chris@42 81 Tp = VMUL(LDK(KP500000000), VFNMSI(To, Tn));
Chris@42 82 Tq = VCONJ(VMUL(LDK(KP500000000), VFMAI(To, Tn)));
Chris@42 83 ST(&(Rp[0]), Tm, ms, &(Rp[0]));
Chris@42 84 ST(&(Rp[WS(rs, 1)]), Tz, ms, &(Rp[WS(rs, 1)]));
Chris@42 85 ST(&(Rm[0]), Ty, -ms, &(Rm[0]));
Chris@42 86 ST(&(Rm[WS(rs, 1)]), Tq, -ms, &(Rm[WS(rs, 1)]));
Chris@42 87 ST(&(Rp[WS(rs, 2)]), Tp, ms, &(Rp[0]));
Chris@42 88 }
Chris@42 89 }
Chris@42 90 VLEAVE();
Chris@42 91 }
Chris@42 92
/*
 * Twiddle instruction list for the HAVE_FMA codelet: VTW(1, 1) through
 * VTW(1, 5), followed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably one VTW entry per non-trivial twiddle of the
 * size-6 transform, with TW_NEXT ending the list -- confirm against the
 * tw_instr / VTW definitions.
 */
Chris@42 93 static const tw_instr twinstr[] = {
Chris@42 94 VTW(1, 1),
Chris@42 95 VTW(1, 2),
Chris@42 96 VTW(1, 3),
Chris@42 97 VTW(1, 4),
Chris@42 98 VTW(1, 5),
Chris@42 99 {TW_NEXT, VL, 0}
Chris@42 100 };
Chris@42 101
/*
 * Codelet descriptor for the HAVE_FMA variant: 6, the codelet name string,
 * the twiddle instruction list, &GENUS, and the counts {17, 18, 12, 0}.
 * The first three counts match the "17 additions, 18 multiplications,
 * 12 fused multiply/add" figures in this variant's generated statistics
 * comment.
 * NOTE(review): field meanings are inferred from the values -- confirm
 * against the hc2c_desc definition.
 */
Chris@42 102 static const hc2c_desc desc = { 6, XSIMD_STRING("hc2cfdftv_6"), twinstr, &GENUS, {17, 18, 12, 0} };
Chris@42 103
/*
 * Registration entry point (HAVE_FMA variant): hands the hc2cfdftv_6
 * kernel, its descriptor and the HC2C_VIA_DFT flag to X(khc2c_register)
 * for planner p.
 */
Chris@42 104 void XSIMD(codelet_hc2cfdftv_6) (planner *p) {
Chris@42 105 X(khc2c_register) (p, hc2cfdftv_6, &desc, HC2C_VIA_DFT);
Chris@42 106 }
Chris@42 107 #else /* HAVE_FMA */
Chris@42 108
Chris@42 109 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 6 -dit -name hc2cfdftv_6 -include hc2cfv.h */
Chris@42 110
Chris@42 111 /*
Chris@42 112 * This function contains 29 FP additions, 20 FP multiplications,
Chris@42 113 * (or, 27 additions, 18 multiplications, 2 fused multiply/add),
Chris@42 114 * 42 stack variables, 3 constants, and 12 memory accesses
Chris@42 115 */
Chris@42 116 #include "hc2cfv.h"
Chris@42 117
/*
 * hc2cfdftv_6, non-FMA variant (compiled when HAVE_FMA is not defined):
 * machine-generated SIMD codelet (the "Generated by" comment in this file
 * shows it was produced with "-n 6 -dit", without "-fma").
 *
 * Per loop iteration the body:
 *   - loads three vectors from Rp (rows 0, 1, 2 via WS(rs, k)) and three
 *     from Rm, taking VCONJ of each Rm vector;
 *   - forms the sum and the difference of each conjugated-Rm / Rp pair,
 *     five of the six being multiplied by a twiddle vector read from W
 *     (offsets TWVL * 0, 2, 4, 6, 8) through VZMULJ / VZMULIJ;
 *   - combines the results using the constants 0.5, 0.25 and
 *     0.866025403... (numerically sqrt(3)/2), with VBYI applied to the
 *     0.866-scaled differences;
 *   - stores six vectors back to the same Rp/Rm locations it read from
 *     (the Rm outputs are passed through VCONJ before the store).
 *
 * Parameters, as used by the visible code:
 *   Rp, Rm  data pointers, both read and written; Rp is advanced by
 *           VL * ms and Rm is moved back by VL * ms after each iteration.
 *   Ip, Im  only stepped in the loop header; never dereferenced here.
 *           NOTE(review): presumably the vector loads through Rp/Rm also
 *           cover the imaginary data -- confirm against LD/ST in the
 *           SIMD headers.
 *   W       twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 10)
 *           and advanced by TWVL * 10 per iteration.
 *   rs      stride passed to WS(rs, k) to address rows 1 and 2.
 *   mb, me  the loop runs m = mb while m < me, in steps of VL.
 *   ms      stride between the data handled by successive iterations.
 *
 * The statement order below is the generator's output; do not reorder
 * by hand.
 */
Chris@42 118 static void hc2cfdftv_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 119 {
Chris@42 120 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 121 DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 122 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 123 {
Chris@42 124 INT m;
Chris@42 125 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 10)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 10), MAKE_VOLATILE_STRIDE(24, rs)) {
Chris@42 126 V Ta, Tu, Tn, Tw, Ti, Tv, T1, T8, Tg, Tf, T7, T3, Te, T6, T2;
Chris@42 127 V T4, T9, T5, Tk, Tm, Tj, Tl, Tc, Th, Tb, Td, Tr, Tp, Tq, To;
Chris@42 128 V Tt, Ts, TA, Ty, Tz, Tx, TC, TB;
/* Load the Rp rows, and the Rm rows together with their conjugates. */
Chris@42 129 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@42 130 T8 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 131 Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@42 132 Te = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@42 133 Tf = VCONJ(Te);
Chris@42 134 T6 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 135 T7 = VCONJ(T6);
Chris@42 136 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@42 137 T3 = VCONJ(T2);
/* Pair sums/differences, twiddled via VZMULJ / VZMULIJ, then combined. */
Chris@42 138 T4 = VADD(T1, T3);
Chris@42 139 T5 = LDW(&(W[TWVL * 4]));
Chris@42 140 T9 = VZMULIJ(T5, VSUB(T7, T8));
Chris@42 141 Ta = VADD(T4, T9);
Chris@42 142 Tu = VSUB(T4, T9);
Chris@42 143 Tj = LDW(&(W[0]));
Chris@42 144 Tk = VZMULIJ(Tj, VSUB(T3, T1));
Chris@42 145 Tl = LDW(&(W[TWVL * 6]));
Chris@42 146 Tm = VZMULJ(Tl, VADD(Tf, Tg));
Chris@42 147 Tn = VADD(Tk, Tm);
Chris@42 148 Tw = VSUB(Tm, Tk);
Chris@42 149 Tb = LDW(&(W[TWVL * 2]));
Chris@42 150 Tc = VZMULJ(Tb, VADD(T7, T8));
Chris@42 151 Td = LDW(&(W[TWVL * 8]));
Chris@42 152 Th = VZMULIJ(Td, VSUB(Tf, Tg));
Chris@42 153 Ti = VADD(Tc, Th);
Chris@42 154 Tv = VSUB(Tc, Th);
/* First output group, built from Ta, Ti, Tn: Rp[0], Rm row 1, Rp row 2. */
Chris@42 155 Tr = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(Tn, Ti))));
Chris@42 156 To = VADD(Ti, Tn);
Chris@42 157 Tp = VMUL(LDK(KP500000000), VADD(Ta, To));
Chris@42 158 Tq = VFNMS(LDK(KP250000000), To, VMUL(LDK(KP500000000), Ta));
Chris@42 159 ST(&(Rp[0]), Tp, ms, &(Rp[0]));
Chris@42 160 Tt = VCONJ(VADD(Tq, Tr));
Chris@42 161 ST(&(Rm[WS(rs, 1)]), Tt, -ms, &(Rm[WS(rs, 1)]));
Chris@42 162 Ts = VSUB(Tq, Tr);
Chris@42 163 ST(&(Rp[WS(rs, 2)]), Ts, ms, &(Rp[0]));
/* Second output group, built from Tu, Tv, Tw: Rm row 2, Rp row 1, Rm[0]. */
Chris@42 164 TA = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(Tw, Tv))));
Chris@42 165 Tx = VADD(Tv, Tw);
Chris@42 166 Ty = VCONJ(VMUL(LDK(KP500000000), VADD(Tu, Tx)));
Chris@42 167 Tz = VFNMS(LDK(KP250000000), Tx, VMUL(LDK(KP500000000), Tu));
Chris@42 168 ST(&(Rm[WS(rs, 2)]), Ty, -ms, &(Rm[0]));
Chris@42 169 TC = VADD(Tz, TA);
Chris@42 170 ST(&(Rp[WS(rs, 1)]), TC, ms, &(Rp[WS(rs, 1)]));
Chris@42 171 TB = VCONJ(VSUB(Tz, TA));
Chris@42 172 ST(&(Rm[0]), TB, -ms, &(Rm[0]));
Chris@42 173 }
Chris@42 174 }
Chris@42 175 VLEAVE();
Chris@42 176 }
Chris@42 177
/*
 * Twiddle instruction list for the non-FMA codelet: VTW(1, 1) through
 * VTW(1, 5), followed by a {TW_NEXT, VL, 0} entry.
 * NOTE(review): presumably one VTW entry per non-trivial twiddle of the
 * size-6 transform, with TW_NEXT ending the list -- confirm against the
 * tw_instr / VTW definitions.
 */
Chris@42 178 static const tw_instr twinstr[] = {
Chris@42 179 VTW(1, 1),
Chris@42 180 VTW(1, 2),
Chris@42 181 VTW(1, 3),
Chris@42 182 VTW(1, 4),
Chris@42 183 VTW(1, 5),
Chris@42 184 {TW_NEXT, VL, 0}
Chris@42 185 };
Chris@42 186
/*
 * Codelet descriptor for the non-FMA variant: 6, the codelet name string,
 * the twiddle instruction list, &GENUS, and the counts {27, 18, 2, 0}.
 * The first three counts match the "27 additions, 18 multiplications,
 * 2 fused multiply/add" figures in this variant's generated statistics
 * comment.
 * NOTE(review): field meanings are inferred from the values -- confirm
 * against the hc2c_desc definition.
 */
Chris@42 187 static const hc2c_desc desc = { 6, XSIMD_STRING("hc2cfdftv_6"), twinstr, &GENUS, {27, 18, 2, 0} };
Chris@42 188
/*
 * Registration entry point (non-FMA variant): hands the hc2cfdftv_6
 * kernel, its descriptor and the HC2C_VIA_DFT flag to X(khc2c_register)
 * for planner p.
 */
Chris@42 189 void XSIMD(codelet_hc2cfdftv_6) (planner *p) {
Chris@42 190 X(khc2c_register) (p, hc2cfdftv_6, &desc, HC2C_VIA_DFT);
Chris@42 191 }
Chris@42 192 #endif /* HAVE_FMA */