annotate src/fftw-3.3.3/rdft/simd/common/hc2cbdftv_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:42:29 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-rdft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dif -sign 1 -name hc2cbdftv_10 -include hc2cbv.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 61 FP additions, 50 FP multiplications,
Chris@10 32 * (or, 33 additions, 22 multiplications, 28 fused multiply/add),
Chris@10 33 * 76 stack variables, 4 constants, and 20 memory accesses
Chris@10 34 */
Chris@10 35 #include "hc2cbv.h"
Chris@10 36
/*
 * hc2cbdftv_10 (HAVE_FMA variant): machine-generated SIMD codelet. Per the
 * generator command line recorded above, it was produced by gen_hc2cdft_c
 * with -n 10 -dif -sign 1.
 *
 * Each loop iteration loads ten vectors, Rp[WS(rs, k)] and Rm[WS(rs, k)] for
 * k = 0..4, combines them with nine twiddle vectors read from W, and stores
 * ten results back to those same Rp / Rm locations (the update is in place).
 *
 * Rp, Rm - data pointers that are both read and written; per iteration Rp
 *          advances by +VL * ms and Rm by -VL * ms.
 * Ip, Im - only advanced in the loop increment; never dereferenced here.
 * W      - twiddle pointer; offset by (mb - 1) * ((TWVL / VL) * 18) before
 *          the first iteration, then advanced by TWVL * 18 per iteration.
 * rs     - stride handed to WS() to index within Rp / Rm.
 * mb, me - the loop runs m = mb, mb + VL, ... while m < me.
 * ms     - per-iteration pointer stride; also passed to every LD / ST.
 *
 * NOTE(review): V, LD/ST, LDW, DVK/LDK, WS and the V* arithmetic macros are
 * defined outside this file; their exact semantics should be confirmed
 * against the SIMD headers pulled in through hc2cbv.h.
 */
Chris@10 37 static void hc2cbdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
/* Constants declared with DVK and read back with LDK. Numerically:
 * KP559016994 ~ sqrt(5)/4, KP618033988 ~ (sqrt(5)-1)/2,
 * KP951056516 ~ sin(2*pi/5). */
Chris@10 39 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 40 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 41 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@10 42 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 43 {
Chris@10 44 INT m;
/* One pass per VL values of m; all five pointers are stepped in the
 * increment clause, Rp/Ip forwards and Rm/Im backwards. */
Chris@10 45 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@10 46 V Ts, T4, TR, T1, TZ, TD, Ty, Tn, Ti, TT, T11, TJ, T15, Tr, TN;
Chris@10 47 V TE, Tv, To, Tb, T8, Tw, Te, Tx, Th, Tt, T7, T9, T2, T3, Tc;
Chris@10 48 V Td, Tf, Tg, T5, T6, Tu, Ta;
/* Load the Rp / Rm inputs and pair each Rp element with an Rm element.
 * NOTE(review): the non-FMA branch of this file forms the same pairs with
 * VCONJ followed by VADD / VSUB, so the V*CONJ(a, b) macros presumably
 * fold the conjugation of `a` into the add/subtract - confirm. */
Chris@10 49 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@10 50 T3 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@10 51 Tc = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@10 52 Td = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@10 53 Tf = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 54 Tg = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 55 T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@10 56 T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@10 57 T8 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 58 Ts = VFMACONJ(T3, T2);
Chris@10 59 T4 = VFNMSCONJ(T3, T2);
Chris@10 60 Tw = VFMACONJ(Td, Tc);
Chris@10 61 Te = VFNMSCONJ(Td, Tc);
Chris@10 62 Tx = VFMACONJ(Tg, Tf);
Chris@10 63 Th = VFMSCONJ(Tg, Tf);
Chris@10 64 Tt = VFMACONJ(T6, T5);
Chris@10 65 T7 = VFNMSCONJ(T6, T5);
Chris@10 66 T9 = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
/* Twiddle vectors are read from W at offsets TWVL * {0, 2, ..., 16};
 * the loads are interleaved with the arithmetic below. */
Chris@10 67 TR = LDW(&(W[TWVL * 8]));
Chris@10 68 T1 = LDW(&(W[TWVL * 4]));
Chris@10 69 TZ = LDW(&(W[TWVL * 12]));
Chris@10 70 TD = VSUB(Tw, Tx);
Chris@10 71 Ty = VADD(Tw, Tx);
Chris@10 72 Tn = VSUB(Te, Th);
Chris@10 73 Ti = VADD(Te, Th);
Chris@10 74 Tu = VFMACONJ(T9, T8);
Chris@10 75 Ta = VFMSCONJ(T9, T8);
Chris@10 76 TT = LDW(&(W[TWVL * 6]));
Chris@10 77 T11 = LDW(&(W[TWVL * 10]));
Chris@10 78 TJ = LDW(&(W[TWVL * 16]));
Chris@10 79 T15 = LDW(&(W[0]));
Chris@10 80 Tr = LDW(&(W[TWVL * 2]));
Chris@10 81 TN = LDW(&(W[TWVL * 14]));
Chris@10 82 TE = VSUB(Tt, Tu);
Chris@10 83 Tv = VADD(Tt, Tu);
Chris@10 84 To = VSUB(T7, Ta);
Chris@10 85 Tb = VADD(T7, Ta);
Chris@10 86 {
Chris@10 87 V TV, TF, Tz, TB, TL, Tp, Tj, Tl, T17, TA, TS, Tk, TC, TU, TK;
Chris@10 88 V Tm, TO, TG, T12, TW, T16, TM, T10, Tq, TX, TY, T18, T19, TQ, TP;
Chris@10 89 V T13, T14, TI, TH;
/* Combine the sum / difference terms using the KP* constants. */
Chris@10 90 TV = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), TD, TE));
Chris@10 91 TF = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), TE, TD));
Chris@10 92 Tz = VADD(Tv, Ty);
Chris@10 93 TB = VSUB(Tv, Ty);
Chris@10 94 TL = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), Tn, To));
Chris@10 95 Tp = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), To, Tn));
Chris@10 96 Tj = VADD(Tb, Ti);
Chris@10 97 Tl = VSUB(Tb, Ti);
/* T17 is the only output term that is not multiplied by a twiddle. */
Chris@10 98 T17 = VADD(Ts, Tz);
Chris@10 99 TA = VFNMS(LDK(KP250000000), Tz, Ts);
Chris@10 100 TS = VZMULI(TR, VADD(T4, Tj));
Chris@10 101 Tk = VFNMS(LDK(KP250000000), Tj, T4);
Chris@10 102 TC = VFNMS(LDK(KP559016994), TB, TA);
Chris@10 103 TU = VFMA(LDK(KP559016994), TB, TA);
Chris@10 104 TK = VFMA(LDK(KP559016994), Tl, Tk);
Chris@10 105 Tm = VFNMS(LDK(KP559016994), Tl, Tk);
/* Multiply each remaining term by its twiddle: terms derived from the
 * conjugate sums go through VZMUL, those derived from the conjugate
 * differences through VZMULI. */
Chris@10 106 TO = VZMUL(TN, VFMAI(TF, TC));
Chris@10 107 TG = VZMUL(Tr, VFNMSI(TF, TC));
Chris@10 108 T12 = VZMUL(T11, VFMAI(TV, TU));
Chris@10 109 TW = VZMUL(TT, VFNMSI(TV, TU));
Chris@10 110 T16 = VZMULI(T15, VFMAI(TL, TK));
Chris@10 111 TM = VZMULI(TJ, VFNMSI(TL, TK));
Chris@10 112 T10 = VZMULI(TZ, VFNMSI(Tp, Tm));
Chris@10 113 Tq = VZMULI(T1, VFMAI(Tp, Tm));
/* Form the outputs: each Rp value is a VADD of two terms, each Rm value
 * is the VCONJ of the VSUB of the same two terms. */
Chris@10 114 TX = VADD(TS, TW);
Chris@10 115 TY = VCONJ(VSUB(TW, TS));
Chris@10 116 T18 = VADD(T16, T17);
Chris@10 117 T19 = VCONJ(VSUB(T17, T16));
Chris@10 118 TQ = VCONJ(VSUB(TO, TM));
Chris@10 119 TP = VADD(TM, TO);
Chris@10 120 T13 = VADD(T10, T12);
Chris@10 121 T14 = VCONJ(VSUB(T12, T10));
Chris@10 122 TI = VCONJ(VSUB(TG, Tq));
Chris@10 123 TH = VADD(Tq, TG);
/* Write all ten results back over the locations that were loaded. */
Chris@10 124 ST(&(Rp[WS(rs, 2)]), TX, ms, &(Rp[0]));
Chris@10 125 ST(&(Rm[WS(rs, 2)]), TY, -ms, &(Rm[0]));
Chris@10 126 ST(&(Rp[0]), T18, ms, &(Rp[0]));
Chris@10 127 ST(&(Rm[0]), T19, -ms, &(Rm[0]));
Chris@10 128 ST(&(Rm[WS(rs, 4)]), TQ, -ms, &(Rm[0]));
Chris@10 129 ST(&(Rp[WS(rs, 4)]), TP, ms, &(Rp[0]));
Chris@10 130 ST(&(Rp[WS(rs, 3)]), T13, ms, &(Rp[WS(rs, 1)]));
Chris@10 131 ST(&(Rm[WS(rs, 3)]), T14, -ms, &(Rm[WS(rs, 1)]));
Chris@10 132 ST(&(Rm[WS(rs, 1)]), TI, -ms, &(Rm[WS(rs, 1)]));
Chris@10 133 ST(&(Rp[WS(rs, 1)]), TH, ms, &(Rp[WS(rs, 1)]));
Chris@10 134 }
Chris@10 135 }
Chris@10 136 }
/* NOTE(review): VLEAVE() is defined elsewhere; presumably a SIMD epilogue
 * hook - confirm. */
Chris@10 137 VLEAVE();
Chris@10 138 }
Chris@10 139
/*
 * Twiddle instruction table for the HAVE_FMA hc2cbdftv_10 codelet: nine
 * VTW(1, k) entries for k = 1..9, terminated by a {TW_NEXT, VL, 0} entry.
 * The kernel correspondingly reads nine twiddle vectors per iteration.
 * NOTE(review): VTW and TW_NEXT are defined elsewhere; the exact layout
 * each entry expands to should be confirmed there.
 */
Chris@10 140 static const tw_instr twinstr[] = {
Chris@10 141 VTW(1, 1),
Chris@10 142 VTW(1, 2),
Chris@10 143 VTW(1, 3),
Chris@10 144 VTW(1, 4),
Chris@10 145 VTW(1, 5),
Chris@10 146 VTW(1, 6),
Chris@10 147 VTW(1, 7),
Chris@10 148 VTW(1, 8),
Chris@10 149 VTW(1, 9),
Chris@10 150 {TW_NEXT, VL, 0}
Chris@10 151 };
Chris@10 152
/*
 * Codelet descriptor handed to the planner at registration time. Fields in
 * order: 10 (presumably the transform size, matching -n 10), the codelet
 * name string, the twiddle table, &GENUS, and the operation counts
 * {33, 22, 28, 0}, which match the header comment of this HAVE_FMA variant
 * (33 additions, 22 multiplications, 28 fused multiply/add).
 */
Chris@10 153 static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cbdftv_10"), twinstr, &GENUS, {33, 22, 28, 0} };
Chris@10 154
/*
 * Registration entry point (HAVE_FMA build): hands the hc2cbdftv_10 kernel
 * and its descriptor to planner `p` through X(khc2c_register), tagged with
 * HC2C_VIA_DFT.
 */
Chris@10 155 void XSIMD(codelet_hc2cbdftv_10) (planner *p) {
Chris@10 156 X(khc2c_register) (p, hc2cbdftv_10, &desc, HC2C_VIA_DFT);
Chris@10 157 }
Chris@10 158 #else /* HAVE_FMA */
Chris@10 159
Chris@10 160 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 10 -dif -sign 1 -name hc2cbdftv_10 -include hc2cbv.h */
Chris@10 161
Chris@10 162 /*
Chris@10 163 * This function contains 61 FP additions, 30 FP multiplications,
Chris@10 164 * (or, 55 additions, 24 multiplications, 6 fused multiply/add),
Chris@10 165 * 81 stack variables, 4 constants, and 20 memory accesses
Chris@10 166 */
Chris@10 167 #include "hc2cbv.h"
Chris@10 168
/*
 * hc2cbdftv_10 (non-FMA variant): machine-generated SIMD codelet. Per the
 * generator command line recorded above, it was produced by gen_hc2cdft_c
 * with -n 10 -dif -sign 1 (without the -fma option).
 *
 * Each loop iteration loads ten vectors, Rp[WS(rs, k)] and Rm[WS(rs, k)] for
 * k = 0..4, combines them with nine twiddle vectors read from W, and stores
 * ten results back to those same Rp / Rm locations (the update is in place).
 *
 * Rp, Rm - data pointers that are both read and written; per iteration Rp
 *          advances by +VL * ms and Rm by -VL * ms.
 * Ip, Im - only advanced in the loop increment; never dereferenced here.
 * W      - twiddle pointer; offset by (mb - 1) * ((TWVL / VL) * 18) before
 *          the first iteration, then advanced by TWVL * 18 per iteration.
 * rs     - stride handed to WS() to index within Rp / Rm.
 * mb, me - the loop runs m = mb, mb + VL, ... while m < me.
 * ms     - per-iteration pointer stride; also passed to every LD / ST.
 *
 * NOTE(review): V, LD/ST, LDW, DVK/LDK, WS and the V* arithmetic macros are
 * defined outside this file; their exact semantics should be confirmed
 * against the SIMD headers pulled in through hc2cbv.h.
 */
Chris@10 169 static void hc2cbdftv_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 170 {
/* Constants declared with DVK and read back with LDK. Numerically:
 * KP951056516 ~ sin(2*pi/5), KP587785252 ~ sin(pi/5),
 * KP559016994 ~ sqrt(5)/4. */
Chris@10 171 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@10 172 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@10 173 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@10 174 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@10 175 {
Chris@10 176 INT m;
/* One pass per VL values of m; all five pointers are stepped in the
 * increment clause, Rp/Ip forwards and Rm/Im backwards. */
Chris@10 177 for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 18)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 18), MAKE_VOLATILE_STRIDE(40, rs)) {
Chris@10 178 V T5, TE, Ts, Tt, TC, Tz, TH, TJ, To, Tq, T2, T4, T3, T9, Tx;
Chris@10 179 V Tm, TB, Td, Ty, Ti, TA, T6, T8, T7, Tl, Tk, Tj, Tc, Tb, Ta;
Chris@10 180 V Tf, Th, Tg, TF, TG, Te, Tn;
/* Five input pairs: each pairs one Rp element with the VCONJ of one Rm
 * element and keeps both their difference and their sum. */
Chris@10 181 T2 = LD(&(Rp[0]), ms, &(Rp[0]));
Chris@10 182 T3 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
Chris@10 183 T4 = VCONJ(T3);
Chris@10 184 T5 = VSUB(T2, T4);
Chris@10 185 TE = VADD(T2, T4);
Chris@10 186 T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
Chris@10 187 T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
Chris@10 188 T8 = VCONJ(T7);
Chris@10 189 T9 = VSUB(T6, T8);
Chris@10 190 Tx = VADD(T6, T8);
Chris@10 191 Tl = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 192 Tj = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 193 Tk = VCONJ(Tj);
Chris@10 194 Tm = VSUB(Tk, Tl);
Chris@10 195 TB = VADD(Tk, Tl);
Chris@10 196 Tc = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
Chris@10 197 Ta = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
Chris@10 198 Tb = VCONJ(Ta);
Chris@10 199 Td = VSUB(Tb, Tc);
Chris@10 200 Ty = VADD(Tb, Tc);
Chris@10 201 Tf = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
Chris@10 202 Tg = LD(&(Rm[0]), -ms, &(Rm[0]));
Chris@10 203 Th = VCONJ(Tg);
Chris@10 204 Ti = VSUB(Tf, Th);
Chris@10 205 TA = VADD(Tf, Th);
/* Second-level sums and differences of the pair results. */
Chris@10 206 Ts = VSUB(T9, Td);
Chris@10 207 Tt = VSUB(Ti, Tm);
Chris@10 208 TC = VSUB(TA, TB);
Chris@10 209 Tz = VSUB(Tx, Ty);
Chris@10 210 TF = VADD(Tx, Ty);
Chris@10 211 TG = VADD(TA, TB);
Chris@10 212 TH = VADD(TF, TG);
Chris@10 213 TJ = VMUL(LDK(KP559016994), VSUB(TF, TG));
Chris@10 214 Te = VADD(T9, Td);
Chris@10 215 Tn = VADD(Ti, Tm);
Chris@10 216 To = VADD(Te, Tn);
Chris@10 217 Tq = VMUL(LDK(KP559016994), VSUB(Te, Tn));
Chris@10 218 {
Chris@10 219 V T1c, TX, Tv, T1b, TR, T15, TL, T17, TT, T11, TW, Tu, TQ, Tr, TP;
Chris@10 220 V Tp, T1, T1a, TO, T14, TD, T10, TK, TZ, TI, Tw, T16, TS, TY, TM;
Chris@10 221 V TU, T1e, TN, T1d, T19, T13, TV, T18, T12;
/* T1c is the only output term that is not multiplied by a twiddle. */
Chris@10 222 T1c = VADD(TE, TH);
/* Terms built from the pair differences: each is multiplied by a twiddle
 * read from W (offsets TWVL * {8, 4, 0, 16, 12}) using VZMULI. */
Chris@10 223 TW = LDW(&(W[TWVL * 8]));
Chris@10 224 TX = VZMULI(TW, VADD(T5, To));
Chris@10 225 Tu = VBYI(VFNMS(LDK(KP951056516), Tt, VMUL(LDK(KP587785252), Ts)));
Chris@10 226 TQ = VBYI(VFMA(LDK(KP951056516), Ts, VMUL(LDK(KP587785252), Tt)));
Chris@10 227 Tp = VFNMS(LDK(KP250000000), To, T5);
Chris@10 228 Tr = VSUB(Tp, Tq);
Chris@10 229 TP = VADD(Tq, Tp);
Chris@10 230 T1 = LDW(&(W[TWVL * 4]));
Chris@10 231 Tv = VZMULI(T1, VSUB(Tr, Tu));
Chris@10 232 T1a = LDW(&(W[0]));
Chris@10 233 T1b = VZMULI(T1a, VADD(TQ, TP));
Chris@10 234 TO = LDW(&(W[TWVL * 16]));
Chris@10 235 TR = VZMULI(TO, VSUB(TP, TQ));
Chris@10 236 T14 = LDW(&(W[TWVL * 12]));
Chris@10 237 T15 = VZMULI(T14, VADD(Tu, Tr));
/* Terms built from the pair sums: each is multiplied by a twiddle read
 * from W (offsets TWVL * {2, 10, 14, 6}) using VZMUL. */
Chris@10 238 TD = VBYI(VFNMS(LDK(KP951056516), TC, VMUL(LDK(KP587785252), Tz)));
Chris@10 239 T10 = VBYI(VFMA(LDK(KP951056516), Tz, VMUL(LDK(KP587785252), TC)));
Chris@10 240 TI = VFNMS(LDK(KP250000000), TH, TE);
Chris@10 241 TK = VSUB(TI, TJ);
Chris@10 242 TZ = VADD(TJ, TI);
Chris@10 243 Tw = LDW(&(W[TWVL * 2]));
Chris@10 244 TL = VZMUL(Tw, VADD(TD, TK));
Chris@10 245 T16 = LDW(&(W[TWVL * 10]));
Chris@10 246 T17 = VZMUL(T16, VADD(T10, TZ));
Chris@10 247 TS = LDW(&(W[TWVL * 14]));
Chris@10 248 TT = VZMUL(TS, VSUB(TK, TD));
Chris@10 249 TY = LDW(&(W[TWVL * 6]));
Chris@10 250 T11 = VZMUL(TY, VSUB(TZ, T10));
/* Form and store the outputs: each Rp value is a VADD of two terms, each
 * Rm value is the VCONJ of the VSUB of the same two terms. All ten
 * results overwrite the locations that were loaded. */
Chris@10 251 TM = VADD(Tv, TL);
Chris@10 252 ST(&(Rp[WS(rs, 1)]), TM, ms, &(Rp[WS(rs, 1)]));
Chris@10 253 TU = VADD(TR, TT);
Chris@10 254 ST(&(Rp[WS(rs, 4)]), TU, ms, &(Rp[0]));
Chris@10 255 T1e = VCONJ(VSUB(T1c, T1b));
Chris@10 256 ST(&(Rm[0]), T1e, -ms, &(Rm[0]));
Chris@10 257 TN = VCONJ(VSUB(TL, Tv));
Chris@10 258 ST(&(Rm[WS(rs, 1)]), TN, -ms, &(Rm[WS(rs, 1)]));
Chris@10 259 T1d = VADD(T1b, T1c);
Chris@10 260 ST(&(Rp[0]), T1d, ms, &(Rp[0]));
Chris@10 261 T19 = VCONJ(VSUB(T17, T15));
Chris@10 262 ST(&(Rm[WS(rs, 3)]), T19, -ms, &(Rm[WS(rs, 1)]));
Chris@10 263 T13 = VCONJ(VSUB(T11, TX));
Chris@10 264 ST(&(Rm[WS(rs, 2)]), T13, -ms, &(Rm[0]));
Chris@10 265 TV = VCONJ(VSUB(TT, TR));
Chris@10 266 ST(&(Rm[WS(rs, 4)]), TV, -ms, &(Rm[0]));
Chris@10 267 T18 = VADD(T15, T17);
Chris@10 268 ST(&(Rp[WS(rs, 3)]), T18, ms, &(Rp[WS(rs, 1)]));
Chris@10 269 T12 = VADD(TX, T11);
Chris@10 270 ST(&(Rp[WS(rs, 2)]), T12, ms, &(Rp[0]));
Chris@10 271 }
Chris@10 272 }
Chris@10 273 }
/* NOTE(review): VLEAVE() is defined elsewhere; presumably a SIMD epilogue
 * hook - confirm. */
Chris@10 274 VLEAVE();
Chris@10 275 }
Chris@10 276
/*
 * Twiddle instruction table for the non-FMA hc2cbdftv_10 codelet: nine
 * VTW(1, k) entries for k = 1..9, terminated by a {TW_NEXT, VL, 0} entry.
 * The kernel correspondingly reads nine twiddle vectors per iteration.
 * NOTE(review): VTW and TW_NEXT are defined elsewhere; the exact layout
 * each entry expands to should be confirmed there.
 */
Chris@10 277 static const tw_instr twinstr[] = {
Chris@10 278 VTW(1, 1),
Chris@10 279 VTW(1, 2),
Chris@10 280 VTW(1, 3),
Chris@10 281 VTW(1, 4),
Chris@10 282 VTW(1, 5),
Chris@10 283 VTW(1, 6),
Chris@10 284 VTW(1, 7),
Chris@10 285 VTW(1, 8),
Chris@10 286 VTW(1, 9),
Chris@10 287 {TW_NEXT, VL, 0}
Chris@10 288 };
Chris@10 289
/*
 * Codelet descriptor handed to the planner at registration time. Fields in
 * order: 10 (presumably the transform size, matching -n 10), the codelet
 * name string, the twiddle table, &GENUS, and the operation counts
 * {55, 24, 6, 0}, which match the header comment of this non-FMA variant
 * (55 additions, 24 multiplications, 6 fused multiply/add).
 */
Chris@10 290 static const hc2c_desc desc = { 10, XSIMD_STRING("hc2cbdftv_10"), twinstr, &GENUS, {55, 24, 6, 0} };
Chris@10 291
/*
 * Registration entry point (non-FMA build): hands the hc2cbdftv_10 kernel
 * and its descriptor to planner `p` through X(khc2c_register), tagged with
 * HC2C_VIA_DFT.
 */
Chris@10 292 void XSIMD(codelet_hc2cbdftv_10) (planner *p) {
Chris@10 293 X(khc2c_register) (p, hc2cbdftv_10, &desc, HC2C_VIA_DFT);
Chris@10 294 }
Chris@10 295 #endif /* HAVE_FMA */