src/fftw-3.3.8/dft/simd/common/q1fv_4.c @ 83:ae30d91d2ffe
Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author: Chris Cannam
date:   Fri, 07 Feb 2020 11:51:13 +0000
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:06:13 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1fv_4 -include dft/simd/q1f.h */

/*
 * This function contains 44 FP additions, 32 FP multiplications,
 * (or, 36 additions, 24 multiplications, 8 fused multiply/add),
 * 22 stack variables, 0 constants, and 32 memory accesses
 */
#include "dft/simd/q1f.h"

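/*
 * Editorial note (not part of the generated output): judging by the
 * generator name (gen_twidsq_c) and the kdft_difsq_register() call below,
 * q1fv_4 is a SIMD "twiddle-square" codelet: it performs four size-4
 * decimation-in-frequency DFTs in place on a 4x4 block addressed by the
 * strides rs and vs, applying conjugate twiddle factors (BYTWJ) to every
 * output except output 0 of each transform.
 */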
static void q1fv_4(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, vs)) {
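               /* Annotation (added): m advances by VL (the SIMD vector
                  length, in complex elements) per iteration; W advances
                  by TWVL * 6 because each step consumes three non-trivial
                  twiddle factors (for outputs 1, 2 and 3), i.e. six real
                  values per vector slot.  MAKE_VOLATILE_STRIDE is FFTW's
                  optimization barrier against over-eager compilers. */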
               V T3, T9, TA, TG, TD, TH, T6, Ta, Te, Tk, Tp, Tv, Ts, Tw, Th;
               V Tl;
               {
                    V T1, T2, Ty, Tz;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                    T3 = VSUB(T1, T2);
                    T9 = VADD(T1, T2);
                    Ty = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
                    Tz = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
                    TA = VSUB(Ty, Tz);
                    TG = VADD(Ty, Tz);
               }
               {
                    V TB, TC, T4, T5;
                    TB = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    TC = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    TD = VSUB(TB, TC);
                    TH = VADD(TB, TC);
                    T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                    T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                    T6 = VSUB(T4, T5);
                    Ta = VADD(T4, T5);
               }
               {
                    V Tc, Td, Tn, To;
                    Tc = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
                    Td = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
                    Te = VSUB(Tc, Td);
                    Tk = VADD(Tc, Td);
                    Tn = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
                    To = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
                    Tp = VSUB(Tn, To);
                    Tv = VADD(Tn, To);
               }
               {
                    V Tq, Tr, Tf, Tg;
                    Tq = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Tr = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Ts = VSUB(Tq, Tr);
                    Tw = VADD(Tq, Tr);
                    Tf = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Tg = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Th = VSUB(Tf, Tg);
                    Tl = VADD(Tf, Tg);
               }
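               /* Annotation (added): output 0 of each length-4 transform
                  is the plain sum of its four inputs, so it is stored
                  without a twiddle multiplication. */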
               ST(&(x[0]), VADD(T9, Ta), ms, &(x[0]));
               ST(&(x[WS(rs, 1)]), VADD(Tk, Tl), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 2)]), VADD(Tv, Tw), ms, &(x[0]));
               ST(&(x[WS(rs, 3)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
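               /* Annotation (added): the remaining outputs are scaled by
                  conjugate twiddles via BYTWJ: W[0] holds w^1 (output 1),
                  W[TWVL * 2] holds w^2 (output 2) and W[TWVL * 4] holds
                  w^3 (output 3).  VFNMSI(b, a) computes a - i*b and
                  VFMAI(b, a) computes a + i*b. */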
               {
                    V T7, Ti, Tt, TE;
                    T7 = BYTWJ(&(W[0]), VFNMSI(T6, T3));
                    ST(&(x[WS(vs, 1)]), T7, ms, &(x[WS(vs, 1)]));
                    Ti = BYTWJ(&(W[0]), VFNMSI(Th, Te));
                    ST(&(x[WS(vs, 1) + WS(rs, 1)]), Ti, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Tt = BYTWJ(&(W[0]), VFNMSI(Ts, Tp));
                    ST(&(x[WS(vs, 1) + WS(rs, 2)]), Tt, ms, &(x[WS(vs, 1)]));
                    TE = BYTWJ(&(W[0]), VFNMSI(TD, TA));
                    ST(&(x[WS(vs, 1) + WS(rs, 3)]), TE, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
               }
               {
                    V T8, Tj, Tu, TF;
                    T8 = BYTWJ(&(W[TWVL * 4]), VFMAI(T6, T3));
                    ST(&(x[WS(vs, 3)]), T8, ms, &(x[WS(vs, 3)]));
                    Tj = BYTWJ(&(W[TWVL * 4]), VFMAI(Th, Te));
                    ST(&(x[WS(vs, 3) + WS(rs, 1)]), Tj, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    Tu = BYTWJ(&(W[TWVL * 4]), VFMAI(Ts, Tp));
                    ST(&(x[WS(vs, 3) + WS(rs, 2)]), Tu, ms, &(x[WS(vs, 3)]));
                    TF = BYTWJ(&(W[TWVL * 4]), VFMAI(TD, TA));
                    ST(&(x[WS(vs, 3) + WS(rs, 3)]), TF, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
               }
               {
                    V Tb, Tm, Tx, TI;
                    Tb = BYTWJ(&(W[TWVL * 2]), VSUB(T9, Ta));
                    ST(&(x[WS(vs, 2)]), Tb, ms, &(x[WS(vs, 2)]));
                    Tm = BYTWJ(&(W[TWVL * 2]), VSUB(Tk, Tl));
                    ST(&(x[WS(vs, 2) + WS(rs, 1)]), Tm, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Tx = BYTWJ(&(W[TWVL * 2]), VSUB(Tv, Tw));
                    ST(&(x[WS(vs, 2) + WS(rs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
                    TI = BYTWJ(&(W[TWVL * 2]), VSUB(TG, TH));
                    ST(&(x[WS(vs, 2) + WS(rs, 3)]), TI, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}

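/*
 * Annotation (added): twinstr appears to tell the planner which twiddle
 * factors to precompute for each loop iteration: powers 1, 2 and 3 of the
 * base twiddle, with TW_NEXT advancing by VL transforms per step.
 */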
static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 4, XSIMD_STRING("q1fv_4"), twinstr, &GENUS, {36, 24, 8, 0}, 0, 0, 0 };

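/*
 * Annotation (added): the {36, 24, 8, 0} initializer appears to be the
 * operation count (additions, multiplications, fused multiply-adds,
 * other) used by the planner to estimate cost; it matches the counts in
 * the generated comment above.
 */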
void XSIMD(codelet_q1fv_4) (planner *p) {
     X(kdft_difsq_register) (p, q1fv_4, &desc);
}
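/*
 * Annotation (added): XSIMD() presumably expands the codelet name with
 * the SIMD instruction-set suffix of the current build, so that each ISA
 * build registers its own copy of this codelet with the planner via
 * X(kdft_difsq_register).
 */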
#else

/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1fv_4 -include dft/simd/q1f.h */

/*
 * This function contains 44 FP additions, 24 FP multiplications,
 * (or, 44 additions, 24 multiplications, 0 fused multiply/add),
 * 22 stack variables, 0 constants, and 32 memory accesses
 */
#include "dft/simd/q1f.h"

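/*
 * Editorial note (not part of the generated output): this non-FMA variant
 * computes the same transform; the only difference is that the
 * multiplications by i are performed explicitly with VBYI() instead of
 * being folded into fused VFMAI()/VFNMSI() operations.
 */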
static void q1fv_4(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, vs)) {
               V T3, T9, TA, TG, TD, TH, T6, Ta, Te, Tk, Tp, Tv, Ts, Tw, Th;
               V Tl;
               {
                    V T1, T2, Ty, Tz;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                    T3 = VSUB(T1, T2);
                    T9 = VADD(T1, T2);
                    Ty = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
                    Tz = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
                    TA = VSUB(Ty, Tz);
                    TG = VADD(Ty, Tz);
               }
               {
                    V TB, TC, T4, T5;
                    TB = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    TC = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    TD = VBYI(VSUB(TB, TC));
                    TH = VADD(TB, TC);
                    T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                    T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                    T6 = VBYI(VSUB(T4, T5));
                    Ta = VADD(T4, T5);
               }
               {
                    V Tc, Td, Tn, To;
                    Tc = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
                    Td = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
                    Te = VSUB(Tc, Td);
                    Tk = VADD(Tc, Td);
                    Tn = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
                    To = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
                    Tp = VSUB(Tn, To);
                    Tv = VADD(Tn, To);
               }
               {
                    V Tq, Tr, Tf, Tg;
                    Tq = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Tr = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Ts = VBYI(VSUB(Tq, Tr));
                    Tw = VADD(Tq, Tr);
                    Tf = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Tg = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Th = VBYI(VSUB(Tf, Tg));
                    Tl = VADD(Tf, Tg);
               }
               ST(&(x[0]), VADD(T9, Ta), ms, &(x[0]));
               ST(&(x[WS(rs, 1)]), VADD(Tk, Tl), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 2)]), VADD(Tv, Tw), ms, &(x[0]));
               ST(&(x[WS(rs, 3)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
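               /* Annotation (added): T6, Th, Ts and TD were pre-multiplied
                  by i via VBYI above, so output 1 is formed as
                  VSUB(a, i*b) and output 3 as VADD(a, i*b), in place of
                  the fused VFNMSI/VFMAI forms used in the FMA branch. */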
               {
                    V T7, Ti, Tt, TE;
                    T7 = BYTWJ(&(W[0]), VSUB(T3, T6));
                    ST(&(x[WS(vs, 1)]), T7, ms, &(x[WS(vs, 1)]));
                    Ti = BYTWJ(&(W[0]), VSUB(Te, Th));
                    ST(&(x[WS(vs, 1) + WS(rs, 1)]), Ti, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    Tt = BYTWJ(&(W[0]), VSUB(Tp, Ts));
                    ST(&(x[WS(vs, 1) + WS(rs, 2)]), Tt, ms, &(x[WS(vs, 1)]));
                    TE = BYTWJ(&(W[0]), VSUB(TA, TD));
                    ST(&(x[WS(vs, 1) + WS(rs, 3)]), TE, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
               }
               {
                    V T8, Tj, Tu, TF;
                    T8 = BYTWJ(&(W[TWVL * 4]), VADD(T3, T6));
                    ST(&(x[WS(vs, 3)]), T8, ms, &(x[WS(vs, 3)]));
                    Tj = BYTWJ(&(W[TWVL * 4]), VADD(Te, Th));
                    ST(&(x[WS(vs, 3) + WS(rs, 1)]), Tj, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    Tu = BYTWJ(&(W[TWVL * 4]), VADD(Tp, Ts));
                    ST(&(x[WS(vs, 3) + WS(rs, 2)]), Tu, ms, &(x[WS(vs, 3)]));
                    TF = BYTWJ(&(W[TWVL * 4]), VADD(TA, TD));
                    ST(&(x[WS(vs, 3) + WS(rs, 3)]), TF, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
               }
               {
                    V Tb, Tm, Tx, TI;
                    Tb = BYTWJ(&(W[TWVL * 2]), VSUB(T9, Ta));
                    ST(&(x[WS(vs, 2)]), Tb, ms, &(x[WS(vs, 2)]));
                    Tm = BYTWJ(&(W[TWVL * 2]), VSUB(Tk, Tl));
                    ST(&(x[WS(vs, 2) + WS(rs, 1)]), Tm, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    Tx = BYTWJ(&(W[TWVL * 2]), VSUB(Tv, Tw));
                    ST(&(x[WS(vs, 2) + WS(rs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
                    TI = BYTWJ(&(W[TWVL * 2]), VSUB(TG, TH));
                    ST(&(x[WS(vs, 2) + WS(rs, 3)]), TI, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 4, XSIMD_STRING("q1fv_4"), twinstr, &GENUS, {44, 24, 0, 0}, 0, 0, 0 };

void XSIMD(codelet_q1fv_4) (planner *p) {
     X(kdft_difsq_register) (p, q1fv_4, &desc);
}
#endif