annotate src/fftw-3.3.5/rdft/simd/common/hc2cfdftv_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:52:40 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dit -name hc2cfdftv_10 -include hc2cfv.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 61 FP additions, 60 FP multiplications,
Chris@42 32 * (or, 33 additions, 32 multiplications, 28 fused multiply/add),
Chris@42 33 * 77 stack variables, 5 constants, and 20 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cfv.h"
Chris@42 36
/*
 * hc2cfdftv_10 (HAVE_FMA variant): genfft-generated SIMD kernel of size 10
 * (see the "Generated by" comment preceding it: -n 10 -dit -fma).
 *
 * Parameters, as used by the visible code:
 *   Rp, Rm - data arrays that are read and then overwritten in place. Each
 *            iteration loads five vectors from each (index 0 and
 *            WS(rs, 1) .. WS(rs, 4)) and stores five vectors back to the
 *            same indices.
 *   Ip, Im - only stepped in the loop increment (Ip forwards, Im backwards);
 *            never dereferenced in this body.
 *   W      - twiddle table. It is first offset by
 *            (mb - 1) * ((TWVL / VL) * 18), then advanced by TWVL * 18 per
 *            iteration; nine values are read per iteration with LDW.
 *   rs     - stride handed to WS() to index Rp / Rm.
 *   mb, me - the loop runs m = mb, mb + VL, ... while m < me.
 *   ms     - per-iteration step: Rp / Ip advance by VL * ms, Rm / Im retreat
 *            by VL * ms. It is also passed to LD / ST (negated for Rm).
 *
 * Returns nothing; results are written through Rp and Rm.
 */
Chris@42 37 static void hc2cfdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/*
 * Vector constants, read later through LDK(). Numerically these are
 * sqrt(5)/4, 1/2, 1/4, (sqrt(5) - 1)/2 and sin(2*pi/5), i.e. the values
 * commonly found in 5-point DFT butterflies.
 */
Chris@42 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 40 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 41 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 44 {
Chris@42 45 INT m;
/*
 * Main loop: one pass per VL values of m. Rp / Ip walk upwards while
 * Rm / Im walk downwards.
 * NOTE(review): MAKE_VOLATILE_STRIDE is an opaque macro from this view;
 * presumably a compiler workaround around stride handling - confirm in the
 * FFTW headers.
 */
Chris@42 46 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@42 47 V T5, T6, Tw, Tr, Tc, Tj, Tl, Tm, Tk, Ts, Tg, Ty, T3, T4, T1;
Chris@42 48 V T2, Tv, Tq, Ta, Tb, T9, Ti, Te, Tf, Td, Tx, Tn, Tt, Th, TQ;
Chris@42 49 V TT, Tz, T7, TR, To, Tu, TU;
/*
 * Load phase. Each Rp[k] / Rm[k] pair is combined twice, once with
 * VFMACONJ and once with VFNMSCONJ. Every combination except
 * VFMACONJ(T2, T1) (kept as T3) is then multiplied by a twiddle loaded
 * from W via VZMULJ / VZMULIJ.
 * NOTE(review): judging by the non-FMA variant of this kernel, VFMACONJ(b, a)
 * is presumably conj(b) + a and VFNMSCONJ(b, a) conj(b) - a - confirm against
 * the SIMD header.
 */
Chris@42 50 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@42 51 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@42 52 Tv = LDW(&(W[0]));
Chris@42 53 T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@42 54 T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@42 55 Tq = LDW(&(W[TWVL * 6]));
Chris@42 56 Ta = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 57 Tb = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 58 T9 = LDW(&(W[TWVL * 2]));
Chris@42 59 Ti = LDW(&(W[TWVL * 4]));
Chris@42 60 Tw = VZMULIJ(Tv, VFNMSCONJ(T2, T1));
Chris@42 61 Te = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 62 Tf = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 63 Tr = VZMULJ(Tq, VFMACONJ(T6, T5));
Chris@42 64 Td = LDW(&(W[TWVL * 12]));
Chris@42 65 Tx = LDW(&(W[TWVL * 10]));
Chris@42 66 Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
Chris@42 67 Tj = VZMULIJ(Ti, VFNMSCONJ(Tb, Ta));
Chris@42 68 Tl = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@42 69 Tm = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@42 70 Tk = LDW(&(W[TWVL * 14]));
Chris@42 71 Ts = LDW(&(W[TWVL * 16]));
Chris@42 72 Tg = VZMULIJ(Td, VFNMSCONJ(Tf, Te));
Chris@42 73 Ty = VZMULJ(Tx, VFMACONJ(Tf, Te));
Chris@42 74 T3 = VFMACONJ(T2, T1);
Chris@42 75 T4 = LDW(&(W[TWVL * 8]));
Chris@42 76 Tn = VZMULJ(Tk, VFMACONJ(Tm, Tl));
Chris@42 77 Tt = VZMULIJ(Ts, VFNMSCONJ(Tm, Tl));
/* First layer of sums and differences of the twiddled terms. */
Chris@42 78 Th = VSUB(Tc, Tg);
Chris@42 79 TQ = VADD(Tc, Tg);
Chris@42 80 TT = VADD(Tw, Ty);
Chris@42 81 Tz = VSUB(Tw, Ty);
Chris@42 82 T7 = VZMULIJ(T4, VFNMSCONJ(T6, T5));
Chris@42 83 TR = VADD(Tj, Tn);
Chris@42 84 To = VSUB(Tj, Tn);
Chris@42 85 Tu = VSUB(Tr, Tt);
Chris@42 86 TU = VADD(Tr, Tt);
Chris@42 87 {
Chris@42 88 V TP, T8, TS, T11, Tp, TH, TA, TG, TV, T12, TE, TB, TM, TI, TZ;
Chris@42 89 V TW, T17, T13, TD, TC, TY, TX, TL, TF, T10, T16, TN, TO, TK, TJ;
Chris@42 90 V T18, T19, T15, T14;
/*
 * Two independent butterfly chains follow. One is built from T8, Th, To,
 * Tu, Tz and the other from TP, TQ, TR, TT, TU. Each chain produces five
 * of the ten stored outputs.
 */
Chris@42 91 TP = VADD(T3, T7);
Chris@42 92 T8 = VSUB(T3, T7);
Chris@42 93 TS = VADD(TQ, TR);
Chris@42 94 T11 = VSUB(TQ, TR);
Chris@42 95 Tp = VSUB(Th, To);
Chris@42 96 TH = VADD(Th, To);
Chris@42 97 TA = VSUB(Tu, Tz);
Chris@42 98 TG = VADD(Tz, Tu);
Chris@42 99 TV = VADD(TT, TU);
Chris@42 100 T12 = VSUB(TU, TT);
Chris@42 101 TE = VSUB(Tp, TA);
Chris@42 102 TB = VADD(Tp, TA);
Chris@42 103 TM = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TG, TH));
Chris@42 104 TI = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TH, TG));
Chris@42 105 TZ = VSUB(TS, TV);
Chris@42 106 TW = VADD(TS, TV);
Chris@42 107 T17 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T11, T12));
Chris@42 108 T13 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T12, T11));
Chris@42 109 TD = VFNMS(LDK(KP250000000), TB, T8);
Chris@42 110 TC = VMUL(LDK(KP500000000), VADD(T8, TB));
Chris@42 111 TY = VFNMS(LDK(KP250000000), TW, TP);
Chris@42 112 TX = VCONJ(VMUL(LDK(KP500000000), VADD(TP, TW)));
Chris@42 113 TL = VFMA(LDK(KP559016994), TE, TD);
Chris@42 114 TF = VFNMS(LDK(KP559016994), TE, TD);
/*
 * Every stored value is scaled by KP500000000 (0.5). Every value written
 * to Rm is additionally wrapped in VCONJ, while values written to Rp are not.
 */
Chris@42 115 ST(&(Rp[0]), TC, ms, &(Rp[0]));
Chris@42 116 T10 = VFMA(LDK(KP559016994), TZ, TY);
Chris@42 117 T16 = VFNMS(LDK(KP559016994), TZ, TY);
Chris@42 118 ST(&(Rm[WS(rs, 4)]), TX, -ms, &(Rm[0]));
Chris@42 119 TN = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TM, TL)));
Chris@42 120 TO = VMUL(LDK(KP500000000), VFMAI(TM, TL));
Chris@42 121 TK = VMUL(LDK(KP500000000), VFMAI(TI, TF));
Chris@42 122 TJ = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TI, TF)));
Chris@42 123 T18 = VMUL(LDK(KP500000000), VFNMSI(T17, T16));
Chris@42 124 T19 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T17, T16)));
Chris@42 125 T15 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T13, T10)));
Chris@42 126 T14 = VMUL(LDK(KP500000000), VFNMSI(T13, T10));
/* Remaining eight stores: Rp[1..4] and Rm[0..3] (indices via WS(rs, k)). */
Chris@42 127 ST(&(Rm[WS(rs, 3)]), TN, -ms, &(Rm[WS(rs, 1)]));
Chris@42 128 ST(&(Rp[WS(rs, 4)]), TO, ms, &(Rp[0]));
Chris@42 129 ST(&(Rp[WS(rs, 2)]), TK, ms, &(Rp[0]));
Chris@42 130 ST(&(Rm[WS(rs, 1)]), TJ, -ms, &(Rm[WS(rs, 1)]));
Chris@42 131 ST(&(Rp[WS(rs, 3)]), T18, ms, &(Rp[WS(rs, 1)]));
Chris@42 132 ST(&(Rm[WS(rs, 2)]), T19, -ms, &(Rm[0]));
Chris@42 133 ST(&(Rm[0]), T15, -ms, &(Rm[0]));
Chris@42 134 ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)]));
Chris@42 135 }
Chris@42 136 }
Chris@42 137 }
/* NOTE(review): VLEAVE() is opaque here; presumably SIMD state cleanup on exit - confirm. */
Chris@42 138 VLEAVE();
Chris@42 139 }
Chris@42 140
/*
 * Twiddle instruction table for the HAVE_FMA kernel: nine VTW(1, k) entries
 * for k = 1 .. 9, terminated by a {TW_NEXT, VL, 0} record. It is referenced
 * by the descriptor `desc`.
 * NOTE(review): the nine entries presumably correspond to the nine LDW loads
 * per iteration in hc2cfdftv_10 - confirm against the VTW definition.
 */
Chris@42 141 static const tw_instr twinstr[] = {
Chris@42 142 VTW(1, 1),
Chris@42 143 VTW(1, 2),
Chris@42 144 VTW(1, 3),
Chris@42 145 VTW(1, 4),
Chris@42 146 VTW(1, 5),
Chris@42 147 VTW(1, 6),
Chris@42 148 VTW(1, 7),
Chris@42 149 VTW(1, 8),
Chris@42 150 VTW(1, 9),
Chris@42 151 {TW_NEXT, VL, 0}
Chris@42 152 };
Chris@42 153
/*
 * Codelet descriptor for the HAVE_FMA kernel: size 10, the name string, the
 * twiddle table and &GENUS. The first three numbers in {33, 32, 28, 0} match
 * the additions / multiplications / fused multiply-adds quoted in the
 * generator comment for this variant.
 * NOTE(review): the meaning of the fourth value (0) is not visible here.
 */
Chris@42 154 static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cfdftv_10"), twinstr, &GENUS, {33, 32, 28, 0} };
Chris@42 155
/*
 * Registration entry point (HAVE_FMA build): passes the planner p, the
 * hc2cfdftv_10 kernel, its descriptor and the HC2C_VIA_DFT constant to
 * X(khc2c_register).
 */
Chris@42 156 void XSIMD(codelet_hc2cfdftv_10) (planner *p) {
Chris@42 157 X(khc2c_register) (p, hc2cfdftv_10, &desc, HC2C_VIA_DFT);
Chris@42 158 }
Chris@42 159 #else /* HAVE_FMA */
Chris@42 160
Chris@42 161 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dit -name hc2cfdftv_10 -include hc2cfv.h */
Chris@42 162
Chris@42 163 /*
Chris@42 164 * This function contains 61 FP additions, 38 FP multiplications,
Chris@42 165 * (or, 55 additions, 32 multiplications, 6 fused multiply/add),
Chris@42 166 * 82 stack variables, 5 constants, and 20 memory accesses
Chris@42 167 */
Chris@42 168 #include "hc2cfv.h"
Chris@42 169
/*
 * hc2cfdftv_10 (non-FMA variant): genfft-generated SIMD kernel of size 10
 * (see the "Generated by" comment preceding it: -n 10 -dit, without -fma).
 * It has the same signature, the same load / store locations and the same
 * loop stepping as the HAVE_FMA variant. Only one of the two is compiled.
 *
 * Parameters, as used by the visible code:
 *   Rp, Rm - data arrays that are read and then overwritten in place. Each
 *            iteration loads five vectors from each (index 0 and
 *            WS(rs, 1) .. WS(rs, 4)) and stores five vectors back to the
 *            same indices.
 *   Ip, Im - only stepped in the loop increment (Ip forwards, Im backwards);
 *            never dereferenced in this body.
 *   W      - twiddle table. It is first offset by
 *            (mb - 1) * ((TWVL / VL) * 18), then advanced by TWVL * 18 per
 *            iteration; nine values are read per iteration with LDW.
 *   rs     - stride handed to WS() to index Rp / Rm.
 *   mb, me - the loop runs m = mb, mb + VL, ... while m < me.
 *   ms     - per-iteration step: Rp / Ip advance by VL * ms, Rm / Im retreat
 *            by VL * ms. It is also passed to LD / ST (negated for Rm).
 *
 * Returns nothing; results are written through Rp and Rm.
 */
Chris@42 170 static void hc2cfdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 171 {
/*
 * Vector constants, read later through LDK(). Numerically these are 1/8,
 * sqrt(5)/8, sin(pi/5), sin(2*pi/5) and 1/2. The 0.5 output scaling is
 * folded into the first two, which is why they are half of the 1/4 and
 * sqrt(5)/4 constants used by the FMA variant.
 */
Chris@42 172 DVK(KP125000000, +0.125000000000000000000000000000000000000000000);
Chris@42 173 DVK(KP279508497, +0.279508497187473712051146708591409529430077295);
Chris@42 174 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 175 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 176 DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 177 {
Chris@42 178 INT m;
/*
 * Main loop: one pass per VL values of m. Rp / Ip walk upwards while
 * Rm / Im walk downwards.
 * NOTE(review): MAKE_VOLATILE_STRIDE is an opaque macro from this view;
 * presumably a compiler workaround around stride handling - confirm in the
 * FFTW headers.
 */
Chris@42 179 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@42 180 V Tl, Tt, Tu, TY, TZ, T10, Tz, TE, TF, TV, TW, TX, Ta, TU, TN;
Chris@42 181 V TR, TH, TQ, TK, TL, TM, TI, TG, TJ, TT, TO, TP, TS, T18, T1c;
Chris@42 182 V T12, T1b, T15, T16, T17, T14, T11, T13, T1e, T19, T1a, T1d;
/*
 * Inner scope: performs all loads and the twiddle multiplications. Only
 * the sums and differences assigned to outer-scope variables (Tl, Tt, Tu,
 * TY, TZ, T10, Tz, TE, TF, TV, TW, TX, Ta, TU) outlive it.
 */
Chris@42 183 {
Chris@42 184 V T1, T3, Ty, T8, T7, TB, Tf, Ts, Tk, Tw, Tq, TD, T2, Tx, T6;
Chris@42 185 V TA, Tc, Te, Td, Tb, Tr, Tj, Ti, Th, Tg, Tv, Tn, Tp, To, Tm;
Chris@42 186 V TC, T4, T9, T5;
/*
 * For each index k: load Rp[k] and Rm[k], apply VCONJ to the Rm value,
 * then form the sum (VADD) and the difference (VSUB, conjugated Rm value
 * minus the Rp value). Each result except T4 is multiplied by a twiddle
 * via VZMULJ (sums) or VZMULIJ (differences).
 */
Chris@42 187 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@42 188 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@42 189 T3 = VCONJ(T2);
Chris@42 190 Tx = LDW(&(W[0]));
Chris@42 191 Ty = VZMULIJ(Tx, VSUB(T3, T1));
Chris@42 192 T8 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@42 193 T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@42 194 T7 = VCONJ(T6);
Chris@42 195 TA = LDW(&(W[TWVL * 6]));
Chris@42 196 TB = VZMULJ(TA, VADD(T7, T8));
Chris@42 197 Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 198 Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 199 Te = VCONJ(Td);
Chris@42 200 Tb = LDW(&(W[TWVL * 2]));
Chris@42 201 Tf = VZMULJ(Tb, VADD(Tc, Te));
Chris@42 202 Tr = LDW(&(W[TWVL * 4]));
Chris@42 203 Ts = VZMULIJ(Tr, VSUB(Te, Tc));
Chris@42 204 Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@42 205 Th = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@42 206 Ti = VCONJ(Th);
Chris@42 207 Tg = LDW(&(W[TWVL * 12]));
Chris@42 208 Tk = VZMULIJ(Tg, VSUB(Ti, Tj));
Chris@42 209 Tv = LDW(&(W[TWVL * 10]));
Chris@42 210 Tw = VZMULJ(Tv, VADD(Ti, Tj));
Chris@42 211 Tn = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@42 212 To = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@42 213 Tp = VCONJ(To);
Chris@42 214 Tm = LDW(&(W[TWVL * 14]));
Chris@42 215 Tq = VZMULJ(Tm, VADD(Tn, Tp));
Chris@42 216 TC = LDW(&(W[TWVL * 16]));
Chris@42 217 TD = VZMULIJ(TC, VSUB(Tp, Tn));
/* First layer of sums and differences, kept for use after this scope. */
Chris@42 218 Tl = VSUB(Tf, Tk);
Chris@42 219 Tt = VSUB(Tq, Ts);
Chris@42 220 Tu = VADD(Tl, Tt);
Chris@42 221 TY = VADD(Ty, Tw);
Chris@42 222 TZ = VADD(TB, TD);
Chris@42 223 T10 = VADD(TY, TZ);
Chris@42 224 Tz = VSUB(Tw, Ty);
Chris@42 225 TE = VSUB(TB, TD);
Chris@42 226 TF = VADD(Tz, TE);
Chris@42 227 TV = VADD(Tf, Tk);
Chris@42 228 TW = VADD(Ts, Tq);
Chris@42 229 TX = VADD(TV, TW);
Chris@42 230 T4 = VADD(T1, T3);
Chris@42 231 T5 = LDW(&(W[TWVL * 8]));
Chris@42 232 T9 = VZMULIJ(T5, VSUB(T7, T8));
Chris@42 233 Ta = VSUB(T4, T9);
Chris@42 234 TU = VADD(T4, T9);
Chris@42 235 }
/*
 * First output group, built from Ta, Tl, Tt, Tu, Tz, TE, TF. It stores to
 * Rm[4], Rm[2], Rp[1], Rm[0] and Rp[3] (indices via WS(rs, k)). Values
 * written to Rm are wrapped in VCONJ.
 */
Chris@42 236 TL = VSUB(Tl, Tt);
Chris@42 237 TM = VSUB(TE, Tz);
Chris@42 238 TN = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), TL, VMUL(LDK(KP587785252), TM))));
Chris@42 239 TR = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), TL, VMUL(LDK(KP951056516), TM))));
Chris@42 240 TI = VMUL(LDK(KP279508497), VSUB(Tu, TF));
Chris@42 241 TG = VADD(Tu, TF);
Chris@42 242 TJ = VFNMS(LDK(KP125000000), TG, VMUL(LDK(KP500000000), Ta));
Chris@42 243 TH = VCONJ(VMUL(LDK(KP500000000), VADD(Ta, TG)));
Chris@42 244 TQ = VSUB(TJ, TI);
Chris@42 245 TK = VADD(TI, TJ);
Chris@42 246 ST(&(Rm[WS(rs, 4)]), TH, -ms, &(Rm[0]));
Chris@42 247 TT = VCONJ(VADD(TQ, TR));
Chris@42 248 ST(&(Rm[WS(rs, 2)]), TT, -ms, &(Rm[0]));
Chris@42 249 TO = VSUB(TK, TN);
Chris@42 250 ST(&(Rp[WS(rs, 1)]), TO, ms, &(Rp[WS(rs, 1)]));
Chris@42 251 TP = VCONJ(VADD(TK, TN));
Chris@42 252 ST(&(Rm[0]), TP, -ms, &(Rm[0]));
Chris@42 253 TS = VSUB(TQ, TR);
Chris@42 254 ST(&(Rp[WS(rs, 3)]), TS, ms, &(Rp[WS(rs, 1)]));
/*
 * Second output group, built from TU, TV, TW, TX, TY, TZ, T10. It stores
 * to Rp[0], Rp[4], Rm[1], Rp[2] and Rm[3]. It has the same shape as the
 * first group.
 */
Chris@42 255 T16 = VSUB(TZ, TY);
Chris@42 256 T17 = VSUB(TV, TW);
Chris@42 257 T18 = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T17, VMUL(LDK(KP951056516), T16))));
Chris@42 258 T1c = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T17, VMUL(LDK(KP587785252), T16))));
Chris@42 259 T14 = VMUL(LDK(KP279508497), VSUB(TX, T10));
Chris@42 260 T11 = VADD(TX, T10);
Chris@42 261 T13 = VFNMS(LDK(KP125000000), T11, VMUL(LDK(KP500000000), TU));
Chris@42 262 T12 = VMUL(LDK(KP500000000), VADD(TU, T11));
Chris@42 263 T1b = VADD(T14, T13);
Chris@42 264 T15 = VSUB(T13, T14);
Chris@42 265 ST(&(Rp[0]), T12, ms, &(Rp[0]));
Chris@42 266 T1e = VADD(T1b, T1c);
Chris@42 267 ST(&(Rp[WS(rs, 4)]), T1e, ms, &(Rp[0]));
Chris@42 268 T19 = VCONJ(VSUB(T15, T18));
Chris@42 269 ST(&(Rm[WS(rs, 1)]), T19, -ms, &(Rm[WS(rs, 1)]));
Chris@42 270 T1a = VADD(T15, T18);
Chris@42 271 ST(&(Rp[WS(rs, 2)]), T1a, ms, &(Rp[0]));
Chris@42 272 T1d = VCONJ(VSUB(T1b, T1c));
Chris@42 273 ST(&(Rm[WS(rs, 3)]), T1d, -ms, &(Rm[WS(rs, 1)]));
Chris@42 274 }
Chris@42 275 }
/* NOTE(review): VLEAVE() is opaque here; presumably SIMD state cleanup on exit - confirm. */
Chris@42 276 VLEAVE();
Chris@42 277 }
Chris@42 278
/*
 * Twiddle instruction table for the non-FMA kernel: nine VTW(1, k) entries
 * for k = 1 .. 9, terminated by a {TW_NEXT, VL, 0} record. Its contents are
 * identical to the table in the HAVE_FMA branch.
 * NOTE(review): the nine entries presumably correspond to the nine LDW loads
 * per iteration in hc2cfdftv_10 - confirm against the VTW definition.
 */
Chris@42 279 static const tw_instr twinstr[] = {
Chris@42 280 VTW(1, 1),
Chris@42 281 VTW(1, 2),
Chris@42 282 VTW(1, 3),
Chris@42 283 VTW(1, 4),
Chris@42 284 VTW(1, 5),
Chris@42 285 VTW(1, 6),
Chris@42 286 VTW(1, 7),
Chris@42 287 VTW(1, 8),
Chris@42 288 VTW(1, 9),
Chris@42 289 {TW_NEXT, VL, 0}
Chris@42 290 };
Chris@42 291
/*
 * Codelet descriptor for the non-FMA kernel: size 10, the name string, the
 * twiddle table and &GENUS. The first three numbers in {55, 32, 6, 0} match
 * the additions / multiplications / fused multiply-adds quoted in the
 * generator comment for this variant.
 * NOTE(review): the meaning of the fourth value (0) is not visible here.
 */
Chris@42 292 static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cfdftv_10"), twinstr, &GENUS, {55, 32, 6, 0} };
Chris@42 293
/*
 * Registration entry point (non-FMA build): passes the planner p, the
 * hc2cfdftv_10 kernel, its descriptor and the HC2C_VIA_DFT constant to
 * X(khc2c_register).
 */
Chris@42 294 void XSIMD(codelet_hc2cfdftv_10) (planner *p) {
Chris@42 295 X(khc2c_register) (p, hc2cfdftv_10, &desc, HC2C_VIA_DFT);
Chris@42 296 }
Chris@42 297 #endif /* HAVE_FMA */