annotate src/fftw-3.3.5/rdft/scalar/r2cb/r2cb_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:49:30 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cb_16 -include r2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 58 FP additions, 32 FP multiplications,
Chris@42 32 * (or, 26 additions, 0 multiplications, 32 fused multiply/add),
Chris@42 33 * 47 stack variables, 4 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cb.h"
Chris@42 36
/*
 * r2cb_16 (HAVE_FMA variant): straight-line size-16 kernel emitted by genfft
 * (see the "Generated by" line above: -fma, -sign 1, -n 16).
 *
 * Each loop iteration loads Cr[0], Cr[WS(csr, 1..8)] and Ci[WS(csi, 1..7)]
 * (16 loads) and stores R0[0], R0[WS(rs, 1..7)], R1[0], R1[WS(rs, 1..7)]
 * (16 stores). Ci[0] and Ci[WS(csi, 8)] are never read.
 *
 * Parameters:
 *   R0, R1    output arrays, indexed via WS(rs, k)
 *   Cr, Ci    input arrays, indexed via WS(csr, k) and WS(csi, k)
 *   v         number of loop iterations; nothing happens when v <= 0
 *   ivs       added to Cr and Ci after every iteration
 *   ovs       added to R0 and R1 after every iteration
 *
 * NOTE(review): by FFTW naming this is presumably the backward
 * (halfcomplex-to-real) transform, with Cr/Ci the real/imaginary input parts
 * and R0/R1 the even-/odd-indexed real outputs -- confirm against r2cb.h and
 * codelet-rdft.h. FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file; presumably a*b + c and c - a*b -- confirm.
 */
Chris@42 37 static void r2cb_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/*
 * Constants introduced through the DK macro (defined elsewhere); each name
 * spells out the leading digits of its value. Numerically these are
 * 2*cos(pi/8), sqrt(2) - 1, sqrt(2) and 2.
 */
Chris@42 39 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
Chris@42 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 41 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
Chris@42 42 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 43 {
Chris@42 44 INT i;
/*
 * One pass of the loop body per transform: i counts down from v, the output
 * pointers step by ovs and the input pointers by ivs. MAKE_VOLATILE_STRIDE
 * is a macro defined elsewhere; its effect is not visible from this file.
 */
Chris@42 45 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
/* TN, TS, TF, TI outlive the inner scopes: they feed the last four stores. */
Chris@42 46 E TN, TS, TF, TI;
Chris@42 47 {
Chris@42 48 E T8, TD, Tj, TL, T5, TM, TE, To, Td, Tq, Tc, TP, Ty, Te, Tr;
Chris@42 49 E Ts;
/* Load phase: all 16 inputs are read here, interleaved with first-stage sums and differences. */
Chris@42 50 {
Chris@42 51 E T4, Ti, T1, T2;
Chris@42 52 T4 = Cr[WS(csr, 4)];
Chris@42 53 Ti = Ci[WS(csi, 4)];
Chris@42 54 T1 = Cr[0];
Chris@42 55 T2 = Cr[WS(csr, 8)];
Chris@42 56 {
Chris@42 57 E Tk, Tn, T6, T7;
Chris@42 58 T6 = Cr[WS(csr, 2)];
Chris@42 59 T7 = Cr[WS(csr, 6)];
Chris@42 60 {
Chris@42 61 E Tl, Th, T3, Tm;
Chris@42 62 Tl = Ci[WS(csi, 2)];
Chris@42 63 Th = T1 - T2;
Chris@42 64 T3 = T1 + T2;
Chris@42 65 Tk = T6 - T7;
Chris@42 66 T8 = T6 + T7;
Chris@42 67 Tm = Ci[WS(csi, 6)];
Chris@42 68 TD = FMA(KP2_000000000, Ti, Th);
Chris@42 69 Tj = FNMS(KP2_000000000, Ti, Th);
Chris@42 70 TL = FNMS(KP2_000000000, T4, T3);
Chris@42 71 T5 = FMA(KP2_000000000, T4, T3);
Chris@42 72 Tn = Tl + Tm;
Chris@42 73 TM = Tl - Tm;
Chris@42 74 }
Chris@42 75 {
Chris@42 76 E Ta, Tb, Tw, Tx;
Chris@42 77 Ta = Cr[WS(csr, 1)];
Chris@42 78 TE = Tk + Tn;
Chris@42 79 To = Tk - Tn;
Chris@42 80 Tb = Cr[WS(csr, 7)];
Chris@42 81 Tw = Ci[WS(csi, 1)];
Chris@42 82 Tx = Ci[WS(csi, 7)];
Chris@42 83 Td = Cr[WS(csr, 5)];
Chris@42 84 Tq = Ta - Tb;
Chris@42 85 Tc = Ta + Tb;
Chris@42 86 TP = Tw - Tx;
Chris@42 87 Ty = Tw + Tx;
Chris@42 88 Te = Cr[WS(csr, 3)];
Chris@42 89 Tr = Ci[WS(csi, 5)];
Chris@42 90 Ts = Ci[WS(csi, 3)];
Chris@42 91 }
Chris@42 92 }
Chris@42 93 }
/* Combine phase: remaining butterflies; twelve of the sixteen outputs are stored inside this scope. */
Chris@42 94 {
Chris@42 95 E TV, TG, TW, TH, TB, Tp, TA, TC, TJ, TK;
Chris@42 96 {
Chris@42 97 E T9, Tz, Tg, Tu, TT, TU, TO, TR;
Chris@42 98 TV = FNMS(KP2_000000000, T8, T5);
Chris@42 99 T9 = FMA(KP2_000000000, T8, T5);
Chris@42 100 {
Chris@42 101 E Tv, Tf, TQ, Tt;
Chris@42 102 Tv = Td - Te;
Chris@42 103 Tf = Td + Te;
Chris@42 104 TQ = Tr - Ts;
Chris@42 105 Tt = Tr + Ts;
Chris@42 106 TG = Ty - Tv;
Chris@42 107 Tz = Tv + Ty;
Chris@42 108 TO = Tc - Tf;
Chris@42 109 Tg = Tc + Tf;
Chris@42 110 TW = TQ + TP;
Chris@42 111 TR = TP - TQ;
Chris@42 112 TH = Tq + Tt;
Chris@42 113 Tu = Tq - Tt;
Chris@42 114 }
Chris@42 115 TN = FNMS(KP2_000000000, TM, TL);
Chris@42 116 TT = FMA(KP2_000000000, TM, TL);
Chris@42 117 TU = TO + TR;
Chris@42 118 TS = TO - TR;
Chris@42 119 R0[0] = FMA(KP2_000000000, Tg, T9);
Chris@42 120 R0[WS(rs, 4)] = FNMS(KP2_000000000, Tg, T9);
Chris@42 121 R0[WS(rs, 7)] = FMA(KP1_414213562, TU, TT);
Chris@42 122 R0[WS(rs, 3)] = FNMS(KP1_414213562, TU, TT);
Chris@42 123 TB = FNMS(KP1_414213562, To, Tj);
Chris@42 124 Tp = FMA(KP1_414213562, To, Tj);
Chris@42 125 TA = FNMS(KP414213562, Tz, Tu);
Chris@42 126 TC = FMA(KP414213562, Tu, Tz);
Chris@42 127 }
Chris@42 128 R0[WS(rs, 6)] = FMA(KP2_000000000, TW, TV);
Chris@42 129 R0[WS(rs, 2)] = FNMS(KP2_000000000, TW, TV);
Chris@42 130 R1[0] = FMA(KP1_847759065, TA, Tp);
Chris@42 131 R1[WS(rs, 4)] = FNMS(KP1_847759065, TA, Tp);
Chris@42 132 TF = FNMS(KP1_414213562, TE, TD);
Chris@42 133 TJ = FMA(KP1_414213562, TE, TD);
Chris@42 134 TK = FMA(KP414213562, TG, TH);
Chris@42 135 TI = FNMS(KP414213562, TH, TG);
Chris@42 136 R1[WS(rs, 6)] = FMA(KP1_847759065, TC, TB);
Chris@42 137 R1[WS(rs, 2)] = FNMS(KP1_847759065, TC, TB);
Chris@42 138 R1[WS(rs, 7)] = FMA(KP1_847759065, TK, TJ);
Chris@42 139 R1[WS(rs, 3)] = FNMS(KP1_847759065, TK, TJ);
Chris@42 140 }
Chris@42 141 }
/* Final four stores, built from the temporaries declared at loop scope. */
Chris@42 142 R0[WS(rs, 1)] = FMA(KP1_414213562, TS, TN);
Chris@42 143 R0[WS(rs, 5)] = FNMS(KP1_414213562, TS, TN);
Chris@42 144 R1[WS(rs, 5)] = FMA(KP1_847759065, TI, TF);
Chris@42 145 R1[WS(rs, 1)] = FNMS(KP1_847759065, TI, TF);
Chris@42 146 }
Chris@42 147 }
Chris@42 148 }
Chris@42 149
/*
 * Descriptor passed to X(kr2c_register) together with the r2cb_16 kernel.
 * The literal 16 and the name string match the generator arguments
 * (-n 16, -name r2cb_16); {26, 0, 32, 0} repeats the 26 additions,
 * 0 multiplications and 32 fused multiply/adds counted in the header comment
 * of this HAVE_FMA variant.
 * NOTE(review): the meaning of the fourth count and of GENUS is defined
 * outside this file (kr2c_desc / r2cb.h) -- confirm there.
 */
Chris@42 150 static const kr2c_desc desc = { 16, "r2cb_16", {26, 0, 32, 0}, &GENUS };
Chris@42 151
/*
 * Registration entry point (HAVE_FMA variant): passes the r2cb_16 kernel and
 * its descriptor to X(kr2c_register) for planner p.
 * NOTE(review): X() is a naming macro defined outside this file; presumably
 * it adds FFTW's precision-dependent symbol prefix -- confirm.
 */
Chris@42 152 void X(codelet_r2cb_16) (planner *p) {
Chris@42 153 X(kr2c_register) (p, r2cb_16, &desc);
Chris@42 154 }
Chris@42 155
Chris@42 156 #else /* HAVE_FMA */
Chris@42 157
Chris@42 158 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cb_16 -include r2cb.h */
Chris@42 159
Chris@42 160 /*
Chris@42 161 * This function contains 58 FP additions, 18 FP multiplications,
Chris@42 162 * (or, 54 additions, 14 multiplications, 4 fused multiply/add),
Chris@42 163 * 31 stack variables, 4 constants, and 32 memory accesses
Chris@42 164 */
Chris@42 165 #include "r2cb.h"
Chris@42 166
/*
 * r2cb_16 (variant compiled when HAVE_FMA is not defined): straight-line
 * size-16 kernel emitted by genfft (see the "Generated by" line above:
 * -sign 1, -n 16, no -fma).
 *
 * Each loop iteration loads Cr[0], Cr[WS(csr, 1..8)] and Ci[WS(csi, 1..7)]
 * (16 loads) and stores R0[0], R0[WS(rs, 1..7)], R1[0], R1[WS(rs, 1..7)]
 * (16 stores). Ci[0] and Ci[WS(csi, 8)] are never read.
 *
 * Parameters:
 *   R0, R1    output arrays, indexed via WS(rs, k)
 *   Cr, Ci    input arrays, indexed via WS(csr, k) and WS(csi, k)
 *   v         number of loop iterations; nothing happens when v <= 0
 *   ivs       added to Cr and Ci after every iteration
 *   ovs       added to R0 and R1 after every iteration
 *
 * NOTE(review): by FFTW naming this is presumably the backward
 * (halfcomplex-to-real) transform, with Cr/Ci the real/imaginary input parts
 * and R0/R1 the even-/odd-indexed real outputs -- confirm against r2cb.h and
 * codelet-rdft.h. The four FMA(a, b, c) / FNMS(a, b, c) uses rely on macros
 * defined outside this file; presumably a*b + c and c - a*b -- confirm.
 */
Chris@42 167 static void r2cb_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 168 {
/*
 * Constants introduced through the DK macro (defined elsewhere); each name
 * spells out the leading digits of its value. Numerically these are
 * 2*cos(pi/8), 2*sin(pi/8), sqrt(2) and 2.
 */
Chris@42 169 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
Chris@42 170 DK(KP765366864, +0.765366864730179543456919968060797733522689125);
Chris@42 171 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
Chris@42 172 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 173 {
Chris@42 174 INT i;
/*
 * One pass of the loop body per transform: i counts down from v, the output
 * pointers step by ovs and the input pointers by ivs. MAKE_VOLATILE_STRIDE
 * is a macro defined elsewhere; its effect is not visible from this file.
 */
Chris@42 175 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
Chris@42 176 E T9, TS, Tl, TG, T6, TR, Ti, TD, Td, Tq, Tg, Tt, Tn, Tu, TV;
Chris@42 177 E TU, TN, TK;
/* Inputs at indices 2 and 6, pre-scaled by 2 or sqrt(2). */
Chris@42 178 {
Chris@42 179 E T7, T8, TE, Tj, Tk, TF;
Chris@42 180 T7 = Cr[WS(csr, 2)];
Chris@42 181 T8 = Cr[WS(csr, 6)];
Chris@42 182 TE = T7 - T8;
Chris@42 183 Tj = Ci[WS(csi, 2)];
Chris@42 184 Tk = Ci[WS(csi, 6)];
Chris@42 185 TF = Tj + Tk;
Chris@42 186 T9 = KP2_000000000 * (T7 + T8);
Chris@42 187 TS = KP1_414213562 * (TE + TF);
Chris@42 188 Tl = KP2_000000000 * (Tj - Tk);
Chris@42 189 TG = KP1_414213562 * (TE - TF);
Chris@42 190 }
/* Inputs at indices 0, 4 and 8. */
Chris@42 191 {
Chris@42 192 E T5, TC, T3, TA;
Chris@42 193 {
Chris@42 194 E T4, TB, T1, T2;
Chris@42 195 T4 = Cr[WS(csr, 4)];
Chris@42 196 T5 = KP2_000000000 * T4;
Chris@42 197 TB = Ci[WS(csi, 4)];
Chris@42 198 TC = KP2_000000000 * TB;
Chris@42 199 T1 = Cr[0];
Chris@42 200 T2 = Cr[WS(csr, 8)];
Chris@42 201 T3 = T1 + T2;
Chris@42 202 TA = T1 - T2;
Chris@42 203 }
Chris@42 204 T6 = T3 + T5;
Chris@42 205 TR = TA + TC;
Chris@42 206 Ti = T3 - T5;
Chris@42 207 TD = TA - TC;
Chris@42 208 }
/* Odd-indexed inputs (1, 7, 5, 3) and their pairwise sums and differences. */
Chris@42 209 {
Chris@42 210 E TI, TM, TL, TJ;
Chris@42 211 {
Chris@42 212 E Tb, Tc, To, Tp;
Chris@42 213 Tb = Cr[WS(csr, 1)];
Chris@42 214 Tc = Cr[WS(csr, 7)];
Chris@42 215 Td = Tb + Tc;
Chris@42 216 TI = Tb - Tc;
Chris@42 217 To = Ci[WS(csi, 1)];
Chris@42 218 Tp = Ci[WS(csi, 7)];
Chris@42 219 Tq = To - Tp;
Chris@42 220 TM = To + Tp;
Chris@42 221 }
Chris@42 222 {
Chris@42 223 E Te, Tf, Tr, Ts;
Chris@42 224 Te = Cr[WS(csr, 5)];
Chris@42 225 Tf = Cr[WS(csr, 3)];
Chris@42 226 Tg = Te + Tf;
Chris@42 227 TL = Te - Tf;
Chris@42 228 Tr = Ci[WS(csi, 5)];
Chris@42 229 Ts = Ci[WS(csi, 3)];
Chris@42 230 Tt = Tr - Ts;
Chris@42 231 TJ = Tr + Ts;
Chris@42 232 }
Chris@42 233 Tn = Td - Tg;
Chris@42 234 Tu = Tq - Tt;
Chris@42 235 TV = TM - TL;
Chris@42 236 TU = TI + TJ;
Chris@42 237 TN = TL + TM;
Chris@42 238 TK = TI - TJ;
Chris@42 239 }
/*
 * Store phase: each of the four scopes below writes four outputs.
 * Expanding the first one, R0[0] = Ta + Th
 *   = Cr[0] + Cr[8] + 2*(Cr[1] + ... + Cr[7]),
 * so no 1/16 scaling is applied to the result.
 */
Chris@42 240 {
Chris@42 241 E Ta, Th, TT, TW;
Chris@42 242 Ta = T6 + T9;
Chris@42 243 Th = KP2_000000000 * (Td + Tg);
Chris@42 244 R0[WS(rs, 4)] = Ta - Th;
Chris@42 245 R0[0] = Ta + Th;
Chris@42 246 TT = TR - TS;
Chris@42 247 TW = FNMS(KP1_847759065, TV, KP765366864 * TU);
Chris@42 248 R1[WS(rs, 5)] = TT - TW;
Chris@42 249 R1[WS(rs, 1)] = TT + TW;
Chris@42 250 }
Chris@42 251 {
Chris@42 252 E TX, TY, Tm, Tv;
Chris@42 253 TX = TR + TS;
Chris@42 254 TY = FMA(KP1_847759065, TU, KP765366864 * TV);
Chris@42 255 R1[WS(rs, 3)] = TX - TY;
Chris@42 256 R1[WS(rs, 7)] = TX + TY;
Chris@42 257 Tm = Ti - Tl;
Chris@42 258 Tv = KP1_414213562 * (Tn - Tu);
Chris@42 259 R0[WS(rs, 5)] = Tm - Tv;
Chris@42 260 R0[WS(rs, 1)] = Tm + Tv;
Chris@42 261 }
Chris@42 262 {
Chris@42 263 E Tw, Tx, TH, TO;
Chris@42 264 Tw = Ti + Tl;
Chris@42 265 Tx = KP1_414213562 * (Tn + Tu);
Chris@42 266 R0[WS(rs, 3)] = Tw - Tx;
Chris@42 267 R0[WS(rs, 7)] = Tw + Tx;
Chris@42 268 TH = TD + TG;
Chris@42 269 TO = FNMS(KP765366864, TN, KP1_847759065 * TK);
Chris@42 270 R1[WS(rs, 4)] = TH - TO;
Chris@42 271 R1[0] = TH + TO;
Chris@42 272 }
Chris@42 273 {
Chris@42 274 E TP, TQ, Ty, Tz;
Chris@42 275 TP = TD - TG;
Chris@42 276 TQ = FMA(KP765366864, TK, KP1_847759065 * TN);
Chris@42 277 R1[WS(rs, 2)] = TP - TQ;
Chris@42 278 R1[WS(rs, 6)] = TP + TQ;
Chris@42 279 Ty = T6 - T9;
Chris@42 280 Tz = KP2_000000000 * (Tt + Tq);
Chris@42 281 R0[WS(rs, 2)] = Ty - Tz;
Chris@42 282 R0[WS(rs, 6)] = Ty + Tz;
Chris@42 283 }
Chris@42 284 }
Chris@42 285 }
Chris@42 286 }
Chris@42 287
/*
 * Descriptor passed to X(kr2c_register) together with the r2cb_16 kernel.
 * The literal 16 and the name string match the generator arguments
 * (-n 16, -name r2cb_16); {54, 14, 4, 0} repeats the 54 additions,
 * 14 multiplications and 4 fused multiply/adds counted in the header comment
 * of this non-FMA variant.
 * NOTE(review): the meaning of the fourth count and of GENUS is defined
 * outside this file (kr2c_desc / r2cb.h) -- confirm there.
 */
Chris@42 288 static const kr2c_desc desc = { 16, "r2cb_16", {54, 14, 4, 0}, &GENUS };
Chris@42 289
/*
 * Registration entry point (non-FMA variant): passes the r2cb_16 kernel and
 * its descriptor to X(kr2c_register) for planner p.
 * NOTE(review): X() is a naming macro defined outside this file; presumably
 * it adds FFTW's precision-dependent symbol prefix -- confirm.
 */
Chris@42 290 void X(codelet_r2cb_16) (planner *p) {
Chris@42 291 X(kr2c_register) (p, r2cb_16, &desc);
Chris@42 292 }
Chris@42 293
Chris@42 294 #endif /* HAVE_FMA */