annotate src/fftw-3.3.8/dft/scalar/codelets/q1_3.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:30 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include dft/scalar/q.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 48 FP additions, 42 FP multiplications,
Chris@82 32 * (or, 18 additions, 12 multiplications, 30 fused multiply/add),
Chris@82 33 * 35 stack variables, 2 constants, and 36 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/q.h"
Chris@82 36
/*
 * q1_3, variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined.
 *
 * For each m in [mb, me) the loop body processes a 3x3 set of complex values
 * whose real parts are in rio and imaginary parts in iio, addressed as
 * WS(vs, i) + WS(rs, j) for i, j in 0..2.  The arrays are updated in place:
 * all 18 loads are issued before the first store.
 *
 * For each input row i (the vs offset), three outputs k = 0, 1, 2 are built
 * from the three elements along rs and stored at WS(vs, k) + WS(rs, i), so
 * the roles of vs and rs are swapped between the loads and the stores:
 *   k = 0: x0 + x1 + x2, stored without any W factor;
 *   k = 1: the "+KP866025403" combination (a, b), then
 *          real = W[0]*a + W[1]*b, imag = W[0]*b - W[1]*a;
 *   k = 2: the "-KP866025403" combination, same formulas with W[2], W[3].
 *
 * NOTE(review): the formulas above assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file -- confirm.
 *
 * Parameters:
 *   rio, iio  real / imaginary data, read and overwritten; both pointers
 *             advance by ms after every iteration.
 *   W         read-only factor table; starts at W + mb*4 and advances by 4
 *             per iteration (W[0..3] are read in each iteration).
 *   rs, vs    strides handed to WS() to form element offsets.
 *   mb, me    half-open iteration range [mb, me).
 *   ms        per-iteration increment applied to rio and iio.
 *
 * MAKE_VOLATILE_STRIDE in the loop increment is a macro defined elsewhere;
 * its effect cannot be determined from this file.
 */
Chris@82 37 static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numerically, 0.8660254037844386... equals sqrt(3)/2. */
Chris@82 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 41 {
Chris@82 42 INT m;
Chris@82 43 for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
Chris@82 44 E T1, T4, T6, Tg, Td, Te, T9, Tf, Tp, Ts, Tu, TE, TB, TC, Tx;
Chris@82 45 E TD, TZ, T10, TV, T11, TN, TQ, TS, T12;
/* Load phase: every input is read here, and for each row the sum of elements
 * 1 and 2, their difference, and (element 0 - 0.5 * sum) are precomputed. */
Chris@82 46 {
Chris@82 47 E T2, T3, Tv, Tw;
/* Row 0, real parts. */
Chris@82 48 T1 = rio[0];
Chris@82 49 T2 = rio[WS(rs, 1)];
Chris@82 50 T3 = rio[WS(rs, 2)];
Chris@82 51 T4 = T2 + T3;
Chris@82 52 T6 = FNMS(KP500000000, T4, T1);
Chris@82 53 Tg = T3 - T2;
Chris@82 54 {
Chris@82 55 E T7, T8, Tq, Tr;
/* Row 0, imaginary parts. */
Chris@82 56 Td = iio[0];
Chris@82 57 T7 = iio[WS(rs, 1)];
Chris@82 58 T8 = iio[WS(rs, 2)];
Chris@82 59 Te = T7 + T8;
Chris@82 60 T9 = T7 - T8;
Chris@82 61 Tf = FNMS(KP500000000, Te, Td);
/* Row 1 (offset WS(vs, 1)), real parts. */
Chris@82 62 Tp = rio[WS(vs, 1)];
Chris@82 63 Tq = rio[WS(vs, 1) + WS(rs, 1)];
Chris@82 64 Tr = rio[WS(vs, 1) + WS(rs, 2)];
Chris@82 65 Ts = Tq + Tr;
Chris@82 66 Tu = FNMS(KP500000000, Ts, Tp);
Chris@82 67 TE = Tr - Tq;
Chris@82 68 }
/* Row 1, imaginary parts. */
Chris@82 69 TB = iio[WS(vs, 1)];
Chris@82 70 Tv = iio[WS(vs, 1) + WS(rs, 1)];
Chris@82 71 Tw = iio[WS(vs, 1) + WS(rs, 2)];
Chris@82 72 TC = Tv + Tw;
Chris@82 73 Tx = Tv - Tw;
Chris@82 74 TD = FNMS(KP500000000, TC, TB);
Chris@82 75 {
Chris@82 76 E TT, TU, TO, TP;
/* Row 2 (offset WS(vs, 2)), imaginary parts first, then real parts. */
Chris@82 77 TZ = iio[WS(vs, 2)];
Chris@82 78 TT = iio[WS(vs, 2) + WS(rs, 1)];
Chris@82 79 TU = iio[WS(vs, 2) + WS(rs, 2)];
Chris@82 80 T10 = TT + TU;
Chris@82 81 TV = TT - TU;
Chris@82 82 T11 = FNMS(KP500000000, T10, TZ);
Chris@82 83 TN = rio[WS(vs, 2)];
Chris@82 84 TO = rio[WS(vs, 2) + WS(rs, 1)];
Chris@82 85 TP = rio[WS(vs, 2) + WS(rs, 2)];
Chris@82 86 TQ = TO + TP;
Chris@82 87 TS = FNMS(KP500000000, TQ, TN);
Chris@82 88 T12 = TP - TO;
Chris@82 89 }
Chris@82 90 }
/* k = 0 outputs: plain sums; the result for input row i goes to WS(rs, i). */
Chris@82 91 rio[0] = T1 + T4;
Chris@82 92 iio[0] = Td + Te;
Chris@82 93 rio[WS(rs, 1)] = Tp + Ts;
Chris@82 94 iio[WS(rs, 1)] = TB + TC;
Chris@82 95 iio[WS(rs, 2)] = TZ + T10;
Chris@82 96 rio[WS(rs, 2)] = TN + TQ;
/* Row 0, k = 1: scaled with W[0], W[1], stored at WS(vs, 1). */
Chris@82 97 {
Chris@82 98 E Ta, Th, Tb, Ti, T5, Tc;
Chris@82 99 Ta = FMA(KP866025403, T9, T6);
Chris@82 100 Th = FMA(KP866025403, Tg, Tf);
Chris@82 101 T5 = W[0];
Chris@82 102 Tb = T5 * Ta;
Chris@82 103 Ti = T5 * Th;
Chris@82 104 Tc = W[1];
Chris@82 105 rio[WS(vs, 1)] = FMA(Tc, Th, Tb);
Chris@82 106 iio[WS(vs, 1)] = FNMS(Tc, Ta, Ti);
Chris@82 107 }
/* Row 2, k = 2: scaled with W[2], W[3], stored at WS(vs, 2) + WS(rs, 2). */
Chris@82 108 {
Chris@82 109 E T16, T19, T17, T1a, T15, T18;
Chris@82 110 T16 = FNMS(KP866025403, TV, TS);
Chris@82 111 T19 = FNMS(KP866025403, T12, T11);
Chris@82 112 T15 = W[2];
Chris@82 113 T17 = T15 * T16;
Chris@82 114 T1a = T15 * T19;
Chris@82 115 T18 = W[3];
Chris@82 116 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T18, T19, T17);
Chris@82 117 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T18, T16, T1a);
Chris@82 118 }
/* Row 1, k = 2: scaled with W[2], W[3], stored at WS(vs, 2) + WS(rs, 1). */
Chris@82 119 {
Chris@82 120 E TI, TL, TJ, TM, TH, TK;
Chris@82 121 TI = FNMS(KP866025403, Tx, Tu);
Chris@82 122 TL = FNMS(KP866025403, TE, TD);
Chris@82 123 TH = W[2];
Chris@82 124 TJ = TH * TI;
Chris@82 125 TM = TH * TL;
Chris@82 126 TK = W[3];
Chris@82 127 rio[WS(vs, 2) + WS(rs, 1)] = FMA(TK, TL, TJ);
Chris@82 128 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TK, TI, TM);
Chris@82 129 }
/* Row 1, k = 1: scaled with W[0], W[1], stored at WS(vs, 1) + WS(rs, 1). */
Chris@82 130 {
Chris@82 131 E Ty, TF, Tz, TG, Tt, TA;
Chris@82 132 Ty = FMA(KP866025403, Tx, Tu);
Chris@82 133 TF = FMA(KP866025403, TE, TD);
Chris@82 134 Tt = W[0];
Chris@82 135 Tz = Tt * Ty;
Chris@82 136 TG = Tt * TF;
Chris@82 137 TA = W[1];
Chris@82 138 rio[WS(vs, 1) + WS(rs, 1)] = FMA(TA, TF, Tz);
Chris@82 139 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TA, Ty, TG);
Chris@82 140 }
/* Row 2, k = 1: scaled with W[0], W[1], stored at WS(vs, 1) + WS(rs, 2). */
Chris@82 141 {
Chris@82 142 E TW, T13, TX, T14, TR, TY;
Chris@82 143 TW = FMA(KP866025403, TV, TS);
Chris@82 144 T13 = FMA(KP866025403, T12, T11);
Chris@82 145 TR = W[0];
Chris@82 146 TX = TR * TW;
Chris@82 147 T14 = TR * T13;
Chris@82 148 TY = W[1];
Chris@82 149 rio[WS(vs, 1) + WS(rs, 2)] = FMA(TY, T13, TX);
Chris@82 150 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TY, TW, T14);
Chris@82 151 }
/* Row 0, k = 2: scaled with W[2], W[3], stored at WS(vs, 2). */
Chris@82 152 {
Chris@82 153 E Tk, Tn, Tl, To, Tj, Tm;
Chris@82 154 Tk = FNMS(KP866025403, T9, T6);
Chris@82 155 Tn = FNMS(KP866025403, Tg, Tf);
Chris@82 156 Tj = W[2];
Chris@82 157 Tl = Tj * Tk;
Chris@82 158 To = Tj * Tn;
Chris@82 159 Tm = W[3];
Chris@82 160 rio[WS(vs, 2)] = FMA(Tm, Tn, Tl);
Chris@82 161 iio[WS(vs, 2)] = FNMS(Tm, Tk, To);
Chris@82 162 }
Chris@82 163 }
Chris@82 164 }
Chris@82 165 }
Chris@82 166
/*
 * Two-entry tw_instr table referenced by the ct_desc initializer that follows
 * it (FMA build).
 * NOTE(review): TW_FULL / TW_NEXT and the tw_instr field layout are declared
 * outside this file; presumably this requests the complete factor set for
 * size 3 (q1_3 reads four W values per iteration) -- confirm against the
 * tw_instr definition.
 */
Chris@82 167 static const tw_instr twinstr[] = {
Chris@82 168 {TW_FULL, 0, 3},
Chris@82 169 {TW_NEXT, 1, 0}
Chris@82 170 };
Chris@82 171
/*
 * Descriptor passed to the registration call below (FMA build): size 3, the
 * name "q1_3", the twinstr table and &GENUS.  The {18, 12, 30, 0} entry
 * matches the "18 additions, 12 multiplications, 30 fused multiply/add"
 * counts quoted in the generated header comment of this variant.
 * NOTE(review): ct_desc field names and the meaning of the trailing zeros are
 * not visible in this file -- confirm against the ct_desc declaration.
 */
Chris@82 172 static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, {18, 12, 30, 0}, 0, 0, 0 };
Chris@82 173
/*
 * Registration entry point (FMA build): hands the q1_3 function and its
 * descriptor to X(kdft_difsq_register) for planner p.
 * NOTE(review): X() is a macro defined elsewhere; presumably it applies the
 * library's symbol prefix -- confirm.
 */
Chris@82 174 void X(codelet_q1_3) (planner *p) {
Chris@82 175 X(kdft_difsq_register) (p, q1_3, &desc);
Chris@82 176 }
Chris@82 177 #else
Chris@82 178
Chris@82 179 /* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include dft/scalar/q.h */
Chris@82 180
Chris@82 181 /*
Chris@82 182 * This function contains 48 FP additions, 36 FP multiplications,
Chris@82 183 * (or, 30 additions, 18 multiplications, 18 fused multiply/add),
Chris@82 184 * 35 stack variables, 2 constants, and 36 memory accesses
Chris@82 185 */
Chris@82 186 #include "dft/scalar/q.h"
Chris@82 187
/*
 * q1_3, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * For each m in [mb, me) the loop body processes a 3x3 set of complex values
 * whose real parts are in rio and imaginary parts in iio, addressed as
 * WS(vs, i) + WS(rs, j) for i, j in 0..2.  The arrays are updated in place:
 * all 18 loads are issued before the first store.
 *
 * For each input row i (the vs offset), three outputs k = 0, 1, 2 are built
 * from the three elements along rs and stored at WS(vs, k) + WS(rs, i), so
 * the roles of vs and rs are swapped between the loads and the stores:
 *   k = 0: x0 + x1 + x2, stored without any W factor;
 *   k = 1: the "plus" combination (a, b) of the precomputed terms, then
 *          real = W[0]*a + W[1]*b, imag = W[0]*b - W[1]*a;
 *   k = 2: the "minus" combination, same formulas with W[2], W[3].
 * Unlike the FMA build, the KP866025403 products are formed once during the
 * load phase and afterwards only added or subtracted.
 *
 * NOTE(review): the formulas above assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file -- confirm.
 *
 * Parameters:
 *   rio, iio  real / imaginary data, read and overwritten; both pointers
 *             advance by ms after every iteration.
 *   W         read-only factor table; starts at W + mb*4 and advances by 4
 *             per iteration (W[0..3] are read in each iteration).
 *   rs, vs    strides handed to WS() to form element offsets.
 *   mb, me    half-open iteration range [mb, me).
 *   ms        per-iteration increment applied to rio and iio.
 *
 * MAKE_VOLATILE_STRIDE in the loop increment is a macro defined elsewhere;
 * its effect cannot be determined from this file.
 */
Chris@82 188 static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 189 {
/* Numerically, 0.8660254037844386... equals sqrt(3)/2. */
Chris@82 190 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 191 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 192 {
Chris@82 193 INT m;
Chris@82 194 for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
Chris@82 195 E T1, T4, T6, Tc, Td, Te, T9, Tf, Tl, To, Tq, Tw, Tx, Ty, Tt;
Chris@82 196 E Tz, TR, TS, TN, TT, TF, TI, TK, TQ;
/* Load phase: every input is read here, and for each row the sum of elements
 * 1 and 2, KP866025403 times their difference, and (element 0 - 0.5 * sum)
 * are precomputed. */
Chris@82 197 {
Chris@82 198 E T2, T3, Tr, Ts;
/* Row 0, real parts. */
Chris@82 199 T1 = rio[0];
Chris@82 200 T2 = rio[WS(rs, 1)];
Chris@82 201 T3 = rio[WS(rs, 2)];
Chris@82 202 T4 = T2 + T3;
Chris@82 203 T6 = FNMS(KP500000000, T4, T1);
Chris@82 204 Tc = KP866025403 * (T3 - T2);
Chris@82 205 {
Chris@82 206 E T7, T8, Tm, Tn;
/* Row 0, imaginary parts. */
Chris@82 207 Td = iio[0];
Chris@82 208 T7 = iio[WS(rs, 1)];
Chris@82 209 T8 = iio[WS(rs, 2)];
Chris@82 210 Te = T7 + T8;
Chris@82 211 T9 = KP866025403 * (T7 - T8);
Chris@82 212 Tf = FNMS(KP500000000, Te, Td);
/* Row 1 (offset WS(vs, 1)), real parts. */
Chris@82 213 Tl = rio[WS(vs, 1)];
Chris@82 214 Tm = rio[WS(vs, 1) + WS(rs, 1)];
Chris@82 215 Tn = rio[WS(vs, 1) + WS(rs, 2)];
Chris@82 216 To = Tm + Tn;
Chris@82 217 Tq = FNMS(KP500000000, To, Tl);
Chris@82 218 Tw = KP866025403 * (Tn - Tm);
Chris@82 219 }
/* Row 1, imaginary parts. */
Chris@82 220 Tx = iio[WS(vs, 1)];
Chris@82 221 Tr = iio[WS(vs, 1) + WS(rs, 1)];
Chris@82 222 Ts = iio[WS(vs, 1) + WS(rs, 2)];
Chris@82 223 Ty = Tr + Ts;
Chris@82 224 Tt = KP866025403 * (Tr - Ts);
Chris@82 225 Tz = FNMS(KP500000000, Ty, Tx);
Chris@82 226 {
Chris@82 227 E TL, TM, TG, TH;
/* Row 2 (offset WS(vs, 2)), imaginary parts first, then real parts. */
Chris@82 228 TR = iio[WS(vs, 2)];
Chris@82 229 TL = iio[WS(vs, 2) + WS(rs, 1)];
Chris@82 230 TM = iio[WS(vs, 2) + WS(rs, 2)];
Chris@82 231 TS = TL + TM;
Chris@82 232 TN = KP866025403 * (TL - TM);
Chris@82 233 TT = FNMS(KP500000000, TS, TR);
Chris@82 234 TF = rio[WS(vs, 2)];
Chris@82 235 TG = rio[WS(vs, 2) + WS(rs, 1)];
Chris@82 236 TH = rio[WS(vs, 2) + WS(rs, 2)];
Chris@82 237 TI = TG + TH;
Chris@82 238 TK = FNMS(KP500000000, TI, TF);
Chris@82 239 TQ = KP866025403 * (TH - TG);
Chris@82 240 }
Chris@82 241 }
/* k = 0 outputs: plain sums; the result for input row i goes to WS(rs, i). */
Chris@82 242 rio[0] = T1 + T4;
Chris@82 243 iio[0] = Td + Te;
Chris@82 244 rio[WS(rs, 1)] = Tl + To;
Chris@82 245 iio[WS(rs, 1)] = Tx + Ty;
Chris@82 246 iio[WS(rs, 2)] = TR + TS;
Chris@82 247 rio[WS(rs, 2)] = TF + TI;
/* Row 0, k = 1: scaled with W[0], W[1], stored at WS(vs, 1). */
Chris@82 248 {
Chris@82 249 E Ta, Tg, T5, Tb;
Chris@82 250 Ta = T6 + T9;
Chris@82 251 Tg = Tc + Tf;
Chris@82 252 T5 = W[0];
Chris@82 253 Tb = W[1];
Chris@82 254 rio[WS(vs, 1)] = FMA(T5, Ta, Tb * Tg);
Chris@82 255 iio[WS(vs, 1)] = FNMS(Tb, Ta, T5 * Tg);
Chris@82 256 }
/* Row 2, k = 2: scaled with W[2], W[3], stored at WS(vs, 2) + WS(rs, 2). */
Chris@82 257 {
Chris@82 258 E TW, TY, TV, TX;
Chris@82 259 TW = TK - TN;
Chris@82 260 TY = TT - TQ;
Chris@82 261 TV = W[2];
Chris@82 262 TX = W[3];
Chris@82 263 rio[WS(vs, 2) + WS(rs, 2)] = FMA(TV, TW, TX * TY);
Chris@82 264 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(TX, TW, TV * TY);
Chris@82 265 }
/* Row 1, k = 2: scaled with W[2], W[3], stored at WS(vs, 2) + WS(rs, 1). */
Chris@82 266 {
Chris@82 267 E TC, TE, TB, TD;
Chris@82 268 TC = Tq - Tt;
Chris@82 269 TE = Tz - Tw;
Chris@82 270 TB = W[2];
Chris@82 271 TD = W[3];
Chris@82 272 rio[WS(vs, 2) + WS(rs, 1)] = FMA(TB, TC, TD * TE);
Chris@82 273 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TD, TC, TB * TE);
Chris@82 274 }
/* Row 1, k = 1: scaled with W[0], W[1], stored at WS(vs, 1) + WS(rs, 1). */
Chris@82 275 {
Chris@82 276 E Tu, TA, Tp, Tv;
Chris@82 277 Tu = Tq + Tt;
Chris@82 278 TA = Tw + Tz;
Chris@82 279 Tp = W[0];
Chris@82 280 Tv = W[1];
Chris@82 281 rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tp, Tu, Tv * TA);
Chris@82 282 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Tv, Tu, Tp * TA);
Chris@82 283 }
/* Row 2, k = 1: scaled with W[0], W[1], stored at WS(vs, 1) + WS(rs, 2). */
Chris@82 284 {
Chris@82 285 E TO, TU, TJ, TP;
Chris@82 286 TO = TK + TN;
Chris@82 287 TU = TQ + TT;
Chris@82 288 TJ = W[0];
Chris@82 289 TP = W[1];
Chris@82 290 rio[WS(vs, 1) + WS(rs, 2)] = FMA(TJ, TO, TP * TU);
Chris@82 291 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TP, TO, TJ * TU);
Chris@82 292 }
/* Row 0, k = 2: scaled with W[2], W[3], stored at WS(vs, 2). */
Chris@82 293 {
Chris@82 294 E Ti, Tk, Th, Tj;
Chris@82 295 Ti = T6 - T9;
Chris@82 296 Tk = Tf - Tc;
Chris@82 297 Th = W[2];
Chris@82 298 Tj = W[3];
Chris@82 299 rio[WS(vs, 2)] = FMA(Th, Ti, Tj * Tk);
Chris@82 300 iio[WS(vs, 2)] = FNMS(Tj, Ti, Th * Tk);
Chris@82 301 }
Chris@82 302 }
Chris@82 303 }
Chris@82 304 }
Chris@82 305
/*
 * Two-entry tw_instr table referenced by the ct_desc initializer that follows
 * it (non-FMA build).
 * NOTE(review): TW_FULL / TW_NEXT and the tw_instr field layout are declared
 * outside this file; presumably this requests the complete factor set for
 * size 3 (q1_3 reads four W values per iteration) -- confirm against the
 * tw_instr definition.
 */
Chris@82 306 static const tw_instr twinstr[] = {
Chris@82 307 {TW_FULL, 0, 3},
Chris@82 308 {TW_NEXT, 1, 0}
Chris@82 309 };
Chris@82 310
/*
 * Descriptor passed to the registration call below (non-FMA build): size 3,
 * the name "q1_3", the twinstr table and &GENUS.  The {30, 18, 18, 0} entry
 * matches the "30 additions, 18 multiplications, 18 fused multiply/add"
 * counts quoted in the generated header comment of this variant.
 * NOTE(review): ct_desc field names and the meaning of the trailing zeros are
 * not visible in this file -- confirm against the ct_desc declaration.
 */
Chris@82 311 static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, {30, 18, 18, 0}, 0, 0, 0 };
Chris@82 312
/*
 * Registration entry point (non-FMA build): hands the q1_3 function and its
 * descriptor to X(kdft_difsq_register) for planner p.
 * NOTE(review): X() is a macro defined elsewhere; presumably it applies the
 * library's symbol prefix -- confirm.
 */
Chris@82 313 void X(codelet_q1_3) (planner *p) {
Chris@82 314 X(kdft_difsq_register) (p, q1_3, &desc);
Chris@82 315 }
Chris@82 316 #endif