annotate src/fftw-3.3.5/dft/simd/common/q1bv_4.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Jul 30 16:45:28 EDT 2016 */

#include "codelet-dft.h"

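/* Reader's note (not part of the generated file): a rough glossary of the
 * SIMD vocabulary used below, inferred by comparing the HAVE_FMA and
 * non-FMA branches of this file; FFTW's simd-support headers hold the
 * definitive definitions:
 *   V            vector of VL complex elements
 *   LD / ST      vector load / store at the given stride
 *   VADD / VSUB  elementwise complex add / subtract
 *   VBYI(a)      multiply a by i
 *   VFMAI(b, a)  a + i*b, i.e. VADD(a, VBYI(b)) in FMA-friendly form;
 *   VFNMSI(b, a) a - i*b, i.e. VSUB(a, VBYI(b))
 *   BYTW(t, a)   multiply a by the twiddle factor stored at t
 *   WS(s, i)     index i scaled by stride s; TWVL is the twiddle vector length
 */
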
#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1bv_4 -include q1b.h -sign 1 */

/*
 * This function contains 44 FP additions, 32 FP multiplications,
 * (or, 36 additions, 24 multiplications, 8 fused multiply/add),
 * 38 stack variables, 0 constants, and 32 memory accesses
 */
#include "q1b.h"

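/* Reader's note: this is the FMA variant of the size-4 backward (sign +1)
 * DIF butterfly.  Relative to the non-FMA branch below, the explicit
 * VBYI(VSUB(...)) multiplications by i are folded into VFMAI/VFNMSI so the
 * a +/- i*b combinations can map onto fused multiply-add instructions, and
 * -reorder-insns interleaves the four butterflies to hide pipeline latency,
 * which is why the loads, arithmetic and stores appear shuffled together. */
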
static void q1bv_4(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     {
          INT m;
          R *x;
          x = ii;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, vs)) {
               V Tb, Tm, Tx, TI;
               {
                    V Tc, T9, T3, TG, TA, TH, TD, Ta, T6, Td, Tn, To, Tq, Tr, Tf;
                    V Tg;
                    {
                         V T1, T2, Ty, Tz, TB, TC, T4, T5;
                         T1 = LD(&(x[0]), ms, &(x[0]));
                         T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                         Ty = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
                         Tz = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
                         TB = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         TC = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                         T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                         Tc = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
                         T9 = VADD(T1, T2);
                         T3 = VSUB(T1, T2);
                         TG = VADD(Ty, Tz);
                         TA = VSUB(Ty, Tz);
                         TH = VADD(TB, TC);
                         TD = VSUB(TB, TC);
                         Ta = VADD(T4, T5);
                         T6 = VSUB(T4, T5);
                         Td = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
                         Tn = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
                         To = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
                         Tq = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         Tr = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         Tf = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         Tg = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    }
                    {
                         V Tk, Te, Tv, Tp, Tw, Ts, Tl, Th, T7, TE, Tu, TF;
                         ST(&(x[0]), VADD(T9, Ta), ms, &(x[0]));
                         Tk = VADD(Tc, Td);
                         Te = VSUB(Tc, Td);
                         Tv = VADD(Tn, To);
                         Tp = VSUB(Tn, To);
                         Tw = VADD(Tq, Tr);
                         Ts = VSUB(Tq, Tr);
                         Tl = VADD(Tf, Tg);
                         Th = VSUB(Tf, Tg);
                         ST(&(x[WS(rs, 3)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
                         T7 = BYTW(&(W[TWVL * 4]), VFNMSI(T6, T3));
                         TE = BYTW(&(W[TWVL * 4]), VFNMSI(TD, TA));
                         {
                              V Tt, Ti, Tj, T8;
                              T8 = BYTW(&(W[0]), VFMAI(T6, T3));
                              ST(&(x[WS(rs, 2)]), VADD(Tv, Tw), ms, &(x[0]));
                              Tt = BYTW(&(W[TWVL * 4]), VFNMSI(Ts, Tp));
                              ST(&(x[WS(rs, 1)]), VADD(Tk, Tl), ms, &(x[WS(rs, 1)]));
                              Ti = BYTW(&(W[TWVL * 4]), VFNMSI(Th, Te));
                              Tj = BYTW(&(W[0]), VFMAI(Th, Te));
                              ST(&(x[WS(vs, 3)]), T7, ms, &(x[WS(vs, 3)]));
                              ST(&(x[WS(vs, 3) + WS(rs, 3)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                              ST(&(x[WS(vs, 1)]), T8, ms, &(x[WS(vs, 1)]));
                              Tu = BYTW(&(W[0]), VFMAI(Ts, Tp));
                              ST(&(x[WS(vs, 3) + WS(rs, 2)]), Tt, ms, &(x[WS(vs, 3)]));
                              TF = BYTW(&(W[0]), VFMAI(TD, TA));
                              ST(&(x[WS(vs, 3) + WS(rs, 1)]), Ti, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                              ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tj, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         }
                         Tb = BYTW(&(W[TWVL * 2]), VSUB(T9, Ta));
                         Tm = BYTW(&(W[TWVL * 2]), VSUB(Tk, Tl));
                         Tx = BYTW(&(W[TWVL * 2]), VSUB(Tv, Tw));
                         ST(&(x[WS(vs, 1) + WS(rs, 2)]), Tu, ms, &(x[WS(vs, 1)]));
                         TI = BYTW(&(W[TWVL * 2]), VSUB(TG, TH));
                         ST(&(x[WS(vs, 1) + WS(rs, 3)]), TF, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    }
               }
               ST(&(x[WS(vs, 2)]), Tb, ms, &(x[WS(vs, 2)]));
               ST(&(x[WS(vs, 2) + WS(rs, 1)]), Tm, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
               ST(&(x[WS(vs, 2) + WS(rs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
               ST(&(x[WS(vs, 2) + WS(rs, 3)]), TI, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     {TW_NEXT, VL, 0}
};
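/* Reader's note: the table above asks the planner for twiddles w^1, w^2 and
 * w^3 at each m.  Judging from the loop bookkeeping in this file, each VTW
 * entry occupies TWVL * 2 slots of W (hence the W + TWVL * 6 advance per
 * vector iteration, and the lookups at W[0], W[TWVL * 2] and W[TWVL * 4]). */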

static const ct_desc desc = { 4, XSIMD_STRING("q1bv_4"), twinstr, &GENUS, {36, 24, 8, 0}, 0, 0, 0 };
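/* Reader's note: {36, 24, 8, 0} appears to be the operation-count record
 * (adds, multiplies, fused multiply-adds, other), matching the cost summary
 * in the comment above the function; the planner uses it to estimate cost. */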

void XSIMD(codelet_q1bv_4) (planner *p) {
     X(kdft_difsq_register) (p, q1bv_4, &desc);
}
#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1bv_4 -include q1b.h -sign 1 */

/*
 * This function contains 44 FP additions, 24 FP multiplications,
 * (or, 44 additions, 24 multiplications, 0 fused multiply/add),
 * 22 stack variables, 0 constants, and 32 memory accesses
 */
#include "q1b.h"

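/* Reader's note: with -sign 1 this computes, for each length-4 row
 * (x0, x1, x2, x3) of the block, the backward DIF butterfly
 *      y0 = (x0 + x2) + (x1 + x3)
 *      y1 = (x0 - x2) + i*(x1 - x3)
 *      y2 = (x0 + x2) - (x1 + x3)
 *      y3 = (x0 - x2) - i*(x1 - x3)
 * with y1, y2, y3 then multiplied by the twiddles at W[0], W[TWVL * 2] and
 * W[TWVL * 4].  The stores write the results back transposed across the
 * vs/rs offsets of the 4 x 4 block, as a gen_twidsq_c codelet does. */
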
static void q1bv_4(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     {
          INT m;
          R *x;
          x = ii;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, vs)) {
               V T3, T9, TA, TG, TD, TH, T6, Ta, Te, Tk, Tp, Tv, Ts, Tw, Th;
               V Tl;
               {
                    V T1, T2, Ty, Tz;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                    T3 = VSUB(T1, T2);
                    T9 = VADD(T1, T2);
                    Ty = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
                    Tz = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
                    TA = VSUB(Ty, Tz);
                    TG = VADD(Ty, Tz);
               }
               {
                    V TB, TC, T4, T5;
                    TB = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    TC = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    TD = VBYI(VSUB(TB, TC));
                    TH = VADD(TB, TC);
                    T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                    T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                    T6 = VBYI(VSUB(T4, T5));
                    Ta = VADD(T4, T5);
               }
               {
                    V Tc, Td, Tn, To;
                    Tc = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
                    Td = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
                    Te = VSUB(Tc, Td);
                    Tk = VADD(Tc, Td);
                    Tn = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
                    To = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
                    Tp = VSUB(Tn, To);
                    Tv = VADD(Tn, To);
               }
               {
                    V Tq, Tr, Tf, Tg;
                    Tq = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Tr = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Ts = VBYI(VSUB(Tq, Tr));
                    Tw = VADD(Tq, Tr);
                    Tf = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Tg = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Th = VBYI(VSUB(Tf, Tg));
                    Tl = VADD(Tf, Tg);
               }
               ST(&(x[0]), VADD(T9, Ta), ms, &(x[0]));
               ST(&(x[WS(rs, 1)]), VADD(Tk, Tl), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 2)]), VADD(Tv, Tw), ms, &(x[0]));
               ST(&(x[WS(rs, 3)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
               {
                    V T7, Ti, Tt, TE;
                    T7 = BYTW(&(W[TWVL * 4]), VSUB(T3, T6));
                    ST(&(x[WS(vs, 3)]), T7, ms, &(x[WS(vs, 3)]));
                    Ti = BYTW(&(W[TWVL * 4]), VSUB(Te, Th));
                    ST(&(x[WS(vs, 3) + WS(rs, 1)]), Ti, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    Tt = BYTW(&(W[TWVL * 4]), VSUB(Tp, Ts));
                    ST(&(x[WS(vs, 3) + WS(rs, 2)]), Tt, ms, &(x[WS(vs, 3)]));
                    TE = BYTW(&(W[TWVL * 4]), VSUB(TA, TD));
                    ST(&(x[WS(vs, 3) + WS(rs, 3)]), TE, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
               }
               {
                    V T8, Tj, Tu, TF;
                    T8 = BYTW(&(W[0]), VADD(T3, T6));
                    ST(&(x[WS(vs, 1)]), T8, ms, &(x[WS(vs, 1)]));
                    Tj = BYTW(&(W[0]), VADD(Te, Th));
                    ST(&(x[WS(vs, 1) + WS(rs, 1)]), Tj, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Tu = BYTW(&(W[0]), VADD(Tp, Ts));
                    ST(&(x[WS(vs, 1) + WS(rs, 2)]), Tu, ms, &(x[WS(vs, 1)]));
                    TF = BYTW(&(W[0]), VADD(TA, TD));
                    ST(&(x[WS(vs, 1) + WS(rs, 3)]), TF, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
               }
               {
                    V Tb, Tm, Tx, TI;
                    Tb = BYTW(&(W[TWVL * 2]), VSUB(T9, Ta));
                    ST(&(x[WS(vs, 2)]), Tb, ms, &(x[WS(vs, 2)]));
                    Tm = BYTW(&(W[TWVL * 2]), VSUB(Tk, Tl));
                    ST(&(x[WS(vs, 2) + WS(rs, 1)]), Tm, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Tx = BYTW(&(W[TWVL * 2]), VSUB(Tv, Tw));
                    ST(&(x[WS(vs, 2) + WS(rs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
                    TI = BYTW(&(W[TWVL * 2]), VSUB(TG, TH));
                    ST(&(x[WS(vs, 2) + WS(rs, 3)]), TI, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 4, XSIMD_STRING("q1bv_4"), twinstr, &GENUS, {44, 24, 0, 0}, 0, 0, 0 };

void XSIMD(codelet_q1bv_4) (planner *p) {
     X(kdft_difsq_register) (p, q1bv_4, &desc);
}
#endif /* HAVE_FMA */
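
/* Reader's note: XSIMD expands the entry point to a per-instruction-set
 * symbol (something like fftw_codelet_q1bv_4_avx when built in an AVX
 * subdirectory -- the exact name here is an illustrative assumption).  The
 * build compiles this file once per SIMD variant, and the planner calls the
 * registration function to add the codelet, together with its twiddle
 * layout and cost record, to the set of available DIF-square solvers. */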