annotate src/fftw-3.3.8/rdft/scalar/r2cb/r2cb_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:29 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cb_16 -include rdft/scalar/r2cb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 58 FP additions, 32 FP multiplications,
Chris@82 32 * (or, 26 additions, 0 multiplications, 32 fused multiply/add),
Chris@82 33 * 31 stack variables, 4 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/r2cb.h"
Chris@82 36
Chris@82 37 static void r2cb_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) /* FMA-flavoured r2cb_16 codelet (compiled when the enclosing #if holds): each pass reads Cr[0], Cr[WS(csr, 1..8)] and Ci[WS(csi, 1..7)], and writes eight values each into R0 and R1. */
Chris@82 38 { /* rs, csr and csi are only used through WS() to index R0/R1, Cr and Ci respectively; v is the pass count; ivs/ovs are the per-pass pointer steps for inputs/outputs. */
Chris@82 39 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252); /* numerically 2*cos(pi/8) */
Chris@82 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875); /* numerically sqrt(2) - 1, i.e. tan(pi/8) */
Chris@82 41 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); /* numerically sqrt(2) */
Chris@82 42 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@82 43 {
Chris@82 44 INT i;
Chris@82 45 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) { /* Runs v passes; after each pass R0/R1 advance by ovs and Cr/Ci by ivs. MAKE_VOLATILE_STRIDE is defined elsewhere -- presumably a compiler hint, TODO confirm. */
Chris@82 46 E T5, TL, Tj, TD, T8, TM, To, TE, Tc, TP, Tf, TQ, Tu, Tz, TR; /* intermediates shared between the load stages and the store stages below */
Chris@82 47 E TO, TH, TG;
Chris@82 48 { /* Stage 1: loads Cr[0], Cr[8], Cr[4], Ci[4]; produces T5, TL, Tj, TD. */
Chris@82 49 E T4, Ti, T3, Th, T1, T2;
Chris@82 50 T4 = Cr[WS(csr, 4)];
Chris@82 51 Ti = Ci[WS(csi, 4)];
Chris@82 52 T1 = Cr[0];
Chris@82 53 T2 = Cr[WS(csr, 8)];
Chris@82 54 T3 = T1 + T2;
Chris@82 55 Th = T1 - T2;
Chris@82 56 T5 = FMA(KP2_000000000, T4, T3); /* NOTE(review): FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) presumably c - a*b; both are defined outside this file -- confirm. */
Chris@82 57 TL = FNMS(KP2_000000000, T4, T3);
Chris@82 58 Tj = FNMS(KP2_000000000, Ti, Th);
Chris@82 59 TD = FMA(KP2_000000000, Ti, Th);
Chris@82 60 }
Chris@82 61 { /* Stage 2: loads Cr and Ci at positions 2 and 6; produces T8, TM, To, TE. */
Chris@82 62 E T6, T7, Tk, Tl, Tm, Tn;
Chris@82 63 T6 = Cr[WS(csr, 2)];
Chris@82 64 T7 = Cr[WS(csr, 6)];
Chris@82 65 Tk = T6 - T7;
Chris@82 66 Tl = Ci[WS(csi, 2)];
Chris@82 67 Tm = Ci[WS(csi, 6)];
Chris@82 68 Tn = Tl + Tm;
Chris@82 69 T8 = T6 + T7;
Chris@82 70 TM = Tl - Tm;
Chris@82 71 To = Tk - Tn;
Chris@82 72 TE = Tk + Tn;
Chris@82 73 }
Chris@82 74 { /* Stage 3: loads Cr and Ci at the odd positions 1, 7, 5, 3; produces Tc, Tf, TP, TQ, Tu, Tz, TR, TO, TH, TG. */
Chris@82 75 E Tq, Ty, Tv, Tt;
Chris@82 76 { /* sums and differences of positions 1 and 7 */
Chris@82 77 E Ta, Tb, Tw, Tx;
Chris@82 78 Ta = Cr[WS(csr, 1)];
Chris@82 79 Tb = Cr[WS(csr, 7)];
Chris@82 80 Tc = Ta + Tb;
Chris@82 81 Tq = Ta - Tb;
Chris@82 82 Tw = Ci[WS(csi, 1)];
Chris@82 83 Tx = Ci[WS(csi, 7)];
Chris@82 84 Ty = Tw + Tx;
Chris@82 85 TP = Tw - Tx;
Chris@82 86 }
Chris@82 87 { /* sums and differences of positions 5 and 3 */
Chris@82 88 E Td, Te, Tr, Ts;
Chris@82 89 Td = Cr[WS(csr, 5)];
Chris@82 90 Te = Cr[WS(csr, 3)];
Chris@82 91 Tf = Td + Te;
Chris@82 92 Tv = Td - Te;
Chris@82 93 Tr = Ci[WS(csi, 5)];
Chris@82 94 Ts = Ci[WS(csi, 3)];
Chris@82 95 Tt = Tr + Ts;
Chris@82 96 TQ = Tr - Ts;
Chris@82 97 }
Chris@82 98 Tu = Tq - Tt;
Chris@82 99 Tz = Tv + Ty;
Chris@82 100 TR = TP - TQ;
Chris@82 101 TO = Tc - Tf;
Chris@82 102 TH = Tq + Tt;
Chris@82 103 TG = Ty - Tv;
Chris@82 104 }
Chris@82 105 { /* Stores R0[WS(rs, 4)], R0[0], R0[WS(rs, 3)], R0[WS(rs, 7)]. */
Chris@82 106 E T9, Tg, TT, TU;
Chris@82 107 T9 = FMA(KP2_000000000, T8, T5);
Chris@82 108 Tg = Tc + Tf;
Chris@82 109 R0[WS(rs, 4)] = FNMS(KP2_000000000, Tg, T9);
Chris@82 110 R0[0] = FMA(KP2_000000000, Tg, T9);
Chris@82 111 TT = FMA(KP2_000000000, TM, TL);
Chris@82 112 TU = TO + TR;
Chris@82 113 R0[WS(rs, 3)] = FNMS(KP1_414213562, TU, TT);
Chris@82 114 R0[WS(rs, 7)] = FMA(KP1_414213562, TU, TT);
Chris@82 115 }
Chris@82 116 { /* Stores R0[WS(rs, 2)], R0[WS(rs, 6)], R1[WS(rs, 4)], R1[0]. */
Chris@82 117 E TV, TW, Tp, TA;
Chris@82 118 TV = FNMS(KP2_000000000, T8, T5);
Chris@82 119 TW = TQ + TP;
Chris@82 120 R0[WS(rs, 2)] = FNMS(KP2_000000000, TW, TV);
Chris@82 121 R0[WS(rs, 6)] = FMA(KP2_000000000, TW, TV);
Chris@82 122 Tp = FMA(KP1_414213562, To, Tj);
Chris@82 123 TA = FNMS(KP414213562, Tz, Tu);
Chris@82 124 R1[WS(rs, 4)] = FNMS(KP1_847759065, TA, Tp);
Chris@82 125 R1[0] = FMA(KP1_847759065, TA, Tp);
Chris@82 126 }
Chris@82 127 { /* Stores R1[WS(rs, 2)], R1[WS(rs, 6)], R1[WS(rs, 3)], R1[WS(rs, 7)]. */
Chris@82 128 E TB, TC, TJ, TK;
Chris@82 129 TB = FNMS(KP1_414213562, To, Tj);
Chris@82 130 TC = FMA(KP414213562, Tu, Tz);
Chris@82 131 R1[WS(rs, 2)] = FNMS(KP1_847759065, TC, TB);
Chris@82 132 R1[WS(rs, 6)] = FMA(KP1_847759065, TC, TB);
Chris@82 133 TJ = FMA(KP1_414213562, TE, TD);
Chris@82 134 TK = FMA(KP414213562, TG, TH);
Chris@82 135 R1[WS(rs, 3)] = FNMS(KP1_847759065, TK, TJ);
Chris@82 136 R1[WS(rs, 7)] = FMA(KP1_847759065, TK, TJ);
Chris@82 137 }
Chris@82 138 { /* Stores R0[WS(rs, 5)], R0[WS(rs, 1)], R1[WS(rs, 1)], R1[WS(rs, 5)]. */
Chris@82 139 E TN, TS, TF, TI;
Chris@82 140 TN = FNMS(KP2_000000000, TM, TL);
Chris@82 141 TS = TO - TR;
Chris@82 142 R0[WS(rs, 5)] = FNMS(KP1_414213562, TS, TN);
Chris@82 143 R0[WS(rs, 1)] = FMA(KP1_414213562, TS, TN);
Chris@82 144 TF = FNMS(KP1_414213562, TE, TD);
Chris@82 145 TI = FNMS(KP414213562, TH, TG);
Chris@82 146 R1[WS(rs, 1)] = FNMS(KP1_847759065, TI, TF);
Chris@82 147 R1[WS(rs, 5)] = FMA(KP1_847759065, TI, TF);
Chris@82 148 }
Chris@82 149 }
Chris@82 150 }
Chris@82 151 }
Chris@82 152
Chris@82 153 static const kr2c_desc desc = { 16, "r2cb_16", {26, 0, 32, 0}, &GENUS }; /* Descriptor handed to kr2c_register: 16 and "r2cb_16" match the generator's -n and -name options; 26/0/32 match the add/mul/fused-multiply-add counts quoted in the generator comment. Field layout of kr2c_desc is declared elsewhere. */
Chris@82 154
Chris@82 155 void X(codelet_r2cb_16) (planner *p) { /* Registration entry point: hands the static r2cb_16 function and its descriptor to planner p. X() is defined elsewhere -- presumably the library's symbol-prefix macro, TODO confirm. */
Chris@82 156 X(kr2c_register) (p, r2cb_16, &desc);
Chris@82 157 }
Chris@82 158
Chris@82 159 #else
Chris@82 160
Chris@82 161 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cb_16 -include rdft/scalar/r2cb.h */
Chris@82 162
Chris@82 163 /*
Chris@82 164 * This function contains 58 FP additions, 18 FP multiplications,
Chris@82 165 * (or, 54 additions, 14 multiplications, 4 fused multiply/add),
Chris@82 166 * 31 stack variables, 4 constants, and 32 memory accesses
Chris@82 167 */
Chris@82 168 #include "rdft/scalar/r2cb.h"
Chris@82 169
Chris@82 170 static void r2cb_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) /* Non-FMA-flavoured r2cb_16 codelet (the #else branch): each pass reads Cr[0], Cr[WS(csr, 1..8)] and Ci[WS(csi, 1..7)], and writes eight values each into R0 and R1. */
Chris@82 171 { /* rs, csr and csi are only used through WS() to index R0/R1, Cr and Ci respectively; v is the pass count; ivs/ovs are the per-pass pointer steps for inputs/outputs. */
Chris@82 172 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252); /* numerically 2*cos(pi/8) */
Chris@82 173 DK(KP765366864, +0.765366864730179543456919968060797733522689125); /* numerically 2*sin(pi/8) */
Chris@82 174 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); /* numerically sqrt(2) */
Chris@82 175 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@82 176 {
Chris@82 177 INT i;
Chris@82 178 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) { /* Runs v passes; after each pass R0/R1 advance by ovs and Cr/Ci by ivs. MAKE_VOLATILE_STRIDE is defined elsewhere -- presumably a compiler hint, TODO confirm. */
Chris@82 179 E T9, TS, Tl, TG, T6, TR, Ti, TD, Td, Tq, Tg, Tt, Tn, Tu, TV; /* intermediates shared between the load stages and the store stages below */
Chris@82 180 E TU, TN, TK;
Chris@82 181 { /* Stage 1: loads Cr and Ci at positions 2 and 6; produces T9, TS, Tl, TG, already scaled by 2 or sqrt(2). */
Chris@82 182 E T7, T8, TE, Tj, Tk, TF;
Chris@82 183 T7 = Cr[WS(csr, 2)];
Chris@82 184 T8 = Cr[WS(csr, 6)];
Chris@82 185 TE = T7 - T8;
Chris@82 186 Tj = Ci[WS(csi, 2)];
Chris@82 187 Tk = Ci[WS(csi, 6)];
Chris@82 188 TF = Tj + Tk;
Chris@82 189 T9 = KP2_000000000 * (T7 + T8);
Chris@82 190 TS = KP1_414213562 * (TE + TF);
Chris@82 191 Tl = KP2_000000000 * (Tj - Tk);
Chris@82 192 TG = KP1_414213562 * (TE - TF);
Chris@82 193 }
Chris@82 194 { /* Stage 2: loads Cr[4], Ci[4], Cr[0], Cr[8]; produces T6, TR, Ti, TD. */
Chris@82 195 E T5, TC, T3, TA;
Chris@82 196 {
Chris@82 197 E T4, TB, T1, T2;
Chris@82 198 T4 = Cr[WS(csr, 4)];
Chris@82 199 T5 = KP2_000000000 * T4;
Chris@82 200 TB = Ci[WS(csi, 4)];
Chris@82 201 TC = KP2_000000000 * TB;
Chris@82 202 T1 = Cr[0];
Chris@82 203 T2 = Cr[WS(csr, 8)];
Chris@82 204 T3 = T1 + T2;
Chris@82 205 TA = T1 - T2;
Chris@82 206 }
Chris@82 207 T6 = T3 + T5;
Chris@82 208 TR = TA + TC;
Chris@82 209 Ti = T3 - T5;
Chris@82 210 TD = TA - TC;
Chris@82 211 }
Chris@82 212 { /* Stage 3: loads Cr and Ci at the odd positions 1, 7, 5, 3; produces Td, Tg, Tq, Tt, Tn, Tu, TV, TU, TN, TK. */
Chris@82 213 E TI, TM, TL, TJ;
Chris@82 214 { /* sums and differences of positions 1 and 7 */
Chris@82 215 E Tb, Tc, To, Tp;
Chris@82 216 Tb = Cr[WS(csr, 1)];
Chris@82 217 Tc = Cr[WS(csr, 7)];
Chris@82 218 Td = Tb + Tc;
Chris@82 219 TI = Tb - Tc;
Chris@82 220 To = Ci[WS(csi, 1)];
Chris@82 221 Tp = Ci[WS(csi, 7)];
Chris@82 222 Tq = To - Tp;
Chris@82 223 TM = To + Tp;
Chris@82 224 }
Chris@82 225 { /* sums and differences of positions 5 and 3 */
Chris@82 226 E Te, Tf, Tr, Ts;
Chris@82 227 Te = Cr[WS(csr, 5)];
Chris@82 228 Tf = Cr[WS(csr, 3)];
Chris@82 229 Tg = Te + Tf;
Chris@82 230 TL = Te - Tf;
Chris@82 231 Tr = Ci[WS(csi, 5)];
Chris@82 232 Ts = Ci[WS(csi, 3)];
Chris@82 233 Tt = Tr - Ts;
Chris@82 234 TJ = Tr + Ts;
Chris@82 235 }
Chris@82 236 Tn = Td - Tg;
Chris@82 237 Tu = Tq - Tt;
Chris@82 238 TV = TM - TL;
Chris@82 239 TU = TI + TJ;
Chris@82 240 TN = TL + TM;
Chris@82 241 TK = TI - TJ;
Chris@82 242 }
Chris@82 243 { /* Stores R0[WS(rs, 4)], R0[0], R1[WS(rs, 5)], R1[WS(rs, 1)]. */
Chris@82 244 E Ta, Th, TT, TW;
Chris@82 245 Ta = T6 + T9;
Chris@82 246 Th = KP2_000000000 * (Td + Tg);
Chris@82 247 R0[WS(rs, 4)] = Ta - Th;
Chris@82 248 R0[0] = Ta + Th;
Chris@82 249 TT = TR - TS;
Chris@82 250 TW = FNMS(KP1_847759065, TV, KP765366864 * TU); /* NOTE(review): FNMS(a, b, c) is presumably c - a*b and FMA(a, b, c) presumably a*b + c; both are defined outside this file -- confirm. */
Chris@82 251 R1[WS(rs, 5)] = TT - TW;
Chris@82 252 R1[WS(rs, 1)] = TT + TW;
Chris@82 253 }
Chris@82 254 { /* Stores R1[WS(rs, 3)], R1[WS(rs, 7)], R0[WS(rs, 5)], R0[WS(rs, 1)]. */
Chris@82 255 E TX, TY, Tm, Tv;
Chris@82 256 TX = TR + TS;
Chris@82 257 TY = FMA(KP1_847759065, TU, KP765366864 * TV);
Chris@82 258 R1[WS(rs, 3)] = TX - TY;
Chris@82 259 R1[WS(rs, 7)] = TX + TY;
Chris@82 260 Tm = Ti - Tl;
Chris@82 261 Tv = KP1_414213562 * (Tn - Tu);
Chris@82 262 R0[WS(rs, 5)] = Tm - Tv;
Chris@82 263 R0[WS(rs, 1)] = Tm + Tv;
Chris@82 264 }
Chris@82 265 { /* Stores R0[WS(rs, 3)], R0[WS(rs, 7)], R1[WS(rs, 4)], R1[0]. */
Chris@82 266 E Tw, Tx, TH, TO;
Chris@82 267 Tw = Ti + Tl;
Chris@82 268 Tx = KP1_414213562 * (Tn + Tu);
Chris@82 269 R0[WS(rs, 3)] = Tw - Tx;
Chris@82 270 R0[WS(rs, 7)] = Tw + Tx;
Chris@82 271 TH = TD + TG;
Chris@82 272 TO = FNMS(KP765366864, TN, KP1_847759065 * TK);
Chris@82 273 R1[WS(rs, 4)] = TH - TO;
Chris@82 274 R1[0] = TH + TO;
Chris@82 275 }
Chris@82 276 { /* Stores R1[WS(rs, 2)], R1[WS(rs, 6)], R0[WS(rs, 2)], R0[WS(rs, 6)]. */
Chris@82 277 E TP, TQ, Ty, Tz;
Chris@82 278 TP = TD - TG;
Chris@82 279 TQ = FMA(KP765366864, TK, KP1_847759065 * TN);
Chris@82 280 R1[WS(rs, 2)] = TP - TQ;
Chris@82 281 R1[WS(rs, 6)] = TP + TQ;
Chris@82 282 Ty = T6 - T9;
Chris@82 283 Tz = KP2_000000000 * (Tt + Tq);
Chris@82 284 R0[WS(rs, 2)] = Ty - Tz;
Chris@82 285 R0[WS(rs, 6)] = Ty + Tz;
Chris@82 286 }
Chris@82 287 }
Chris@82 288 }
Chris@82 289 }
Chris@82 290
Chris@82 291 static const kr2c_desc desc = { 16, "r2cb_16", {54, 14, 4, 0}, &GENUS }; /* Descriptor handed to kr2c_register: 16 and "r2cb_16" match the generator's -n and -name options; 54/14/4 match the add/mul/fused-multiply-add counts quoted in the generator comment. Field layout of kr2c_desc is declared elsewhere. */
Chris@82 292
Chris@82 293 void X(codelet_r2cb_16) (planner *p) { /* Registration entry point: hands the static r2cb_16 function and its descriptor to planner p. X() is defined elsewhere -- presumably the library's symbol-prefix macro, TODO confirm. */
Chris@82 294 X(kr2c_register) (p, r2cb_16, &desc);
Chris@82 295 }
Chris@82 296
Chris@82 297 #endif