annotate src/fftw-3.3.8/rdft/scalar/r2cb/r2cbIII_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:44 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cbIII_16 -dft-III -include rdft/scalar/r2cbIII.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 66 FP additions, 36 FP multiplications,
Chris@82 32 * (or, 46 additions, 16 multiplications, 20 fused multiply/add),
Chris@82 33 * 40 stack variables, 9 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/r2cbIII.h"
Chris@82 36
/*
 * r2cbIII_16 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Per loop iteration this reads eight values from Cr (index 0 and
 * WS(csr, 1..7)) and eight from Ci (index 0 and WS(csi, 1..7)), and
 * writes eight values to R0 and eight to R1 (index 0 and WS(rs, 1..7)).
 * The loop runs v times; after each pass R0 and R1 advance by ovs and
 * Cr and Ci advance by ivs.
 *
 * NOTE(review): judging by the generator command line (gen_r2cb,
 * -n 16, -dft-III, -sign 1) this is presumably the size-16 "type III"
 * backward real-data codelet, with R0/R1 holding alternating output
 * samples -- confirm against the FFTW codelet documentation.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined
 * outside this file; presumably a*b + c and c - a*b -- confirm.
 */
Chris@82 37 static void r2cbIII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@82 38 {
/*
 * Numeric constants.  By value: KP198912367 ~ tan(pi/16),
 * KP1_961570560 ~ 2*cos(pi/16), KP668178637 ~ tan(3*pi/16),
 * KP1_662939224 ~ 2*cos(3*pi/16), KP707106781 ~ 1/sqrt(2),
 * KP1_414213562 ~ sqrt(2), KP414213562 ~ tan(pi/8),
 * KP1_847759065 ~ 2*cos(pi/8).
 */
Chris@82 39 DK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 40 DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
Chris@82 41 DK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 42 DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
Chris@82 43 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 44 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
Chris@82 45 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 46 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
Chris@82 47 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@82 48 {
Chris@82 49 INT i;
/*
 * One full set of 16 outputs per pass; i counts down from v.
 * NOTE(review): MAKE_VOLATILE_STRIDE is an opaque macro here; its
 * effect on rs/csr/csi is not visible in this file.
 */
Chris@82 50 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
Chris@82 51 E T7, TW, T13, Tj, TA, TK, TP, TH, Te, TX, T12, To, Tt, TC, TS;
Chris@82 52 E TB, TT, TY;
/* First half of the inputs: indices 0, 7, 4 and 3 of Cr and Ci. */
Chris@82 53 {
Chris@82 54 E T3, Tf, Tz, TU, T6, Tw, Ti, TV;
Chris@82 55 {
Chris@82 56 E T1, T2, Tx, Ty;
Chris@82 57 T1 = Cr[0];
Chris@82 58 T2 = Cr[WS(csr, 7)];
Chris@82 59 T3 = T1 + T2;
Chris@82 60 Tf = T1 - T2;
Chris@82 61 Tx = Ci[0];
Chris@82 62 Ty = Ci[WS(csi, 7)];
Chris@82 63 Tz = Tx + Ty;
Chris@82 64 TU = Ty - Tx;
Chris@82 65 }
Chris@82 66 {
Chris@82 67 E T4, T5, Tg, Th;
Chris@82 68 T4 = Cr[WS(csr, 4)];
Chris@82 69 T5 = Cr[WS(csr, 3)];
Chris@82 70 T6 = T4 + T5;
Chris@82 71 Tw = T4 - T5;
Chris@82 72 Tg = Ci[WS(csi, 4)];
Chris@82 73 Th = Ci[WS(csi, 3)];
Chris@82 74 Ti = Tg + Th;
Chris@82 75 TV = Th - Tg;
Chris@82 76 }
/* Combine the sums/differences of the two input pairs. */
Chris@82 77 T7 = T3 + T6;
Chris@82 78 TW = TU - TV;
Chris@82 79 T13 = TV + TU;
Chris@82 80 Tj = Tf - Ti;
Chris@82 81 TA = Tw + Tz;
Chris@82 82 TK = Tw - Tz;
Chris@82 83 TP = T3 - T6;
Chris@82 84 TH = Tf + Ti;
Chris@82 85 }
/* Second half of the inputs: indices 2, 5, 1 and 6 of Cr and Ci. */
Chris@82 86 {
Chris@82 87 E Ta, Tk, Tn, TR, Td, Tp, Ts, TQ;
Chris@82 88 {
Chris@82 89 E T8, T9, Tl, Tm;
Chris@82 90 T8 = Cr[WS(csr, 2)];
Chris@82 91 T9 = Cr[WS(csr, 5)];
Chris@82 92 Ta = T8 + T9;
Chris@82 93 Tk = T8 - T9;
Chris@82 94 Tl = Ci[WS(csi, 2)];
Chris@82 95 Tm = Ci[WS(csi, 5)];
Chris@82 96 Tn = Tl + Tm;
Chris@82 97 TR = Tl - Tm;
Chris@82 98 }
Chris@82 99 {
Chris@82 100 E Tb, Tc, Tq, Tr;
Chris@82 101 Tb = Cr[WS(csr, 1)];
Chris@82 102 Tc = Cr[WS(csr, 6)];
Chris@82 103 Td = Tb + Tc;
Chris@82 104 Tp = Tb - Tc;
Chris@82 105 Tq = Ci[WS(csi, 1)];
Chris@82 106 Tr = Ci[WS(csi, 6)];
Chris@82 107 Ts = Tq + Tr;
Chris@82 108 TQ = Tr - Tq;
Chris@82 109 }
Chris@82 110 Te = Ta + Td;
Chris@82 111 TX = Ta - Td;
Chris@82 112 T12 = TR + TQ;
Chris@82 113 To = Tk - Tn;
Chris@82 114 Tt = Tp - Ts;
Chris@82 115 TC = Tk + Tn;
Chris@82 116 TS = TQ - TR;
Chris@82 117 TB = Tp + Ts;
Chris@82 118 }
/* R0 outputs 0, 4, 1, 5: built from the pair sums (T7, Te, T12, T13, TP, TS, TW, TX). */
Chris@82 119 R0[0] = KP2_000000000 * (T7 + Te);
Chris@82 120 R0[WS(rs, 4)] = KP2_000000000 * (T13 - T12);
Chris@82 121 TT = TP + TS;
Chris@82 122 TY = TW - TX;
Chris@82 123 R0[WS(rs, 1)] = KP1_847759065 * (FMA(KP414213562, TY, TT));
Chris@82 124 R0[WS(rs, 5)] = KP1_847759065 * (FNMS(KP414213562, TT, TY));
/* R0 outputs 2, 6, 3, 7. */
Chris@82 125 {
Chris@82 126 E T11, T14, TZ, T10;
Chris@82 127 T11 = T7 - Te;
Chris@82 128 T14 = T12 + T13;
Chris@82 129 R0[WS(rs, 2)] = KP1_414213562 * (T11 + T14);
Chris@82 130 R0[WS(rs, 6)] = KP1_414213562 * (T14 - T11);
Chris@82 131 TZ = TX + TW;
Chris@82 132 T10 = TP - TS;
Chris@82 133 R0[WS(rs, 3)] = KP1_847759065 * (FMA(KP414213562, T10, TZ));
Chris@82 134 R0[WS(rs, 7)] = -(KP1_847759065 * (FNMS(KP414213562, TZ, T10)));
Chris@82 135 }
/* R1 outputs 1, 7, 5, 3: TH and TK combined with KP707106781-scaled terms. */
Chris@82 136 {
Chris@82 137 E TJ, TO, TM, TN, TI, TL;
Chris@82 138 TI = TC + TB;
Chris@82 139 TJ = FNMS(KP707106781, TI, TH);
Chris@82 140 TO = FMA(KP707106781, TI, TH);
Chris@82 141 TL = To - Tt;
Chris@82 142 TM = FNMS(KP707106781, TL, TK);
Chris@82 143 TN = FMA(KP707106781, TL, TK);
Chris@82 144 R1[WS(rs, 1)] = KP1_662939224 * (FMA(KP668178637, TM, TJ));
Chris@82 145 R1[WS(rs, 7)] = -(KP1_961570560 * (FNMS(KP198912367, TN, TO)));
Chris@82 146 R1[WS(rs, 5)] = KP1_662939224 * (FNMS(KP668178637, TJ, TM));
Chris@82 147 R1[WS(rs, 3)] = KP1_961570560 * (FMA(KP198912367, TO, TN));
Chris@82 148 }
/* R1 outputs 0, 6, 4, 2: Tj and TA combined with KP707106781-scaled terms. */
Chris@82 149 {
Chris@82 150 E Tv, TG, TE, TF, Tu, TD;
Chris@82 151 Tu = To + Tt;
Chris@82 152 Tv = FMA(KP707106781, Tu, Tj);
Chris@82 153 TG = FNMS(KP707106781, Tu, Tj);
Chris@82 154 TD = TB - TC;
Chris@82 155 TE = FNMS(KP707106781, TD, TA);
Chris@82 156 TF = FMA(KP707106781, TD, TA);
Chris@82 157 R1[0] = KP1_961570560 * (FNMS(KP198912367, TE, Tv));
Chris@82 158 R1[WS(rs, 6)] = -(KP1_662939224 * (FMA(KP668178637, TF, TG)));
Chris@82 159 R1[WS(rs, 4)] = -(KP1_961570560 * (FMA(KP198912367, Tv, TE)));
Chris@82 160 R1[WS(rs, 2)] = -(KP1_662939224 * (FNMS(KP668178637, TG, TF)));
Chris@82 161 }
Chris@82 162 }
Chris@82 163 }
Chris@82 164 }
Chris@82 165
/*
 * Descriptor handed to X(kr2c_register) together with r2cbIII_16.
 * The first three numbers in the braces (46, 16, 20) match the
 * addition / multiplication / fused multiply-add counts quoted in the
 * generated comment ahead of this variant of the function.
 * NOTE(review): the leading 16 is presumably the transform size, and
 * GENUS is defined outside this file (the r2cbIII.h include is the
 * likely source) -- confirm.
 */
Chris@82 166 static const kr2c_desc desc = { 16, "r2cbIII_16", {46, 16, 20, 0}, &GENUS };
Chris@82 167
/*
 * Registration entry point: passes the planner p, the static codelet
 * r2cbIII_16 and its descriptor desc to X(kr2c_register).
 * NOTE(review): X() is a macro defined outside this file, presumably
 * adding the library's symbol prefix -- confirm.
 */
Chris@82 168 void X(codelet_r2cbIII_16) (planner *p) {
Chris@82 169 X(kr2c_register) (p, r2cbIII_16, &desc);
Chris@82 170 }
Chris@82 171
Chris@82 172 #else
Chris@82 173
Chris@82 174 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cbIII_16 -dft-III -include rdft/scalar/r2cbIII.h */
Chris@82 175
Chris@82 176 /*
Chris@82 177 * This function contains 66 FP additions, 32 FP multiplications,
Chris@82 178 * (or, 54 additions, 20 multiplications, 12 fused multiply/add),
Chris@82 179 * 40 stack variables, 9 constants, and 32 memory accesses
Chris@82 180 */
Chris@82 181 #include "rdft/scalar/r2cbIII.h"
Chris@82 182
/*
 * r2cbIII_16 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * Per loop iteration this reads eight values from Cr (index 0 and
 * WS(csr, 1..7)) and eight from Ci (index 0 and WS(csi, 1..7)), and
 * writes eight values to R0 and eight to R1 (index 0 and WS(rs, 1..7)).
 * The loop runs v times; after each pass R0 and R1 advance by ovs and
 * Cr and Ci advance by ivs.
 *
 * Unlike the FMA-preferring variant, the KP707106781 scaling is done
 * with plain multiplies and each output is formed as a two-term
 * combination FMA(c1, x, c2 * y) / FNMS(c1, x, c2 * y).
 *
 * NOTE(review): judging by the generator command line (gen_r2cb,
 * -n 16, -dft-III, -sign 1) this is presumably the size-16 "type III"
 * backward real-data codelet -- confirm against the FFTW codelet
 * documentation.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined
 * outside this file; presumably a*b + c and c - a*b -- confirm.
 */
Chris@82 183 static void r2cbIII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@82 184 {
/*
 * Numeric constants.  By value: KP1_961570560 ~ 2*cos(pi/16),
 * KP390180644 ~ 2*sin(pi/16), KP1_111140466 ~ 2*sin(3*pi/16),
 * KP1_662939224 ~ 2*cos(3*pi/16), KP707106781 ~ 1/sqrt(2),
 * KP1_414213562 ~ sqrt(2), KP765366864 ~ 2*sin(pi/8),
 * KP1_847759065 ~ 2*cos(pi/8).
 */
Chris@82 185 DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
Chris@82 186 DK(KP390180644, +0.390180644032256535696569736954044481855383236);
Chris@82 187 DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
Chris@82 188 DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
Chris@82 189 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 190 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
Chris@82 191 DK(KP765366864, +0.765366864730179543456919968060797733522689125);
Chris@82 192 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
Chris@82 193 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@82 194 {
Chris@82 195 INT i;
/*
 * One full set of 16 outputs per pass; i counts down from v.
 * NOTE(review): MAKE_VOLATILE_STRIDE is an opaque macro here; its
 * effect on rs/csr/csi is not visible in this file.
 */
Chris@82 196 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
Chris@82 197 E T7, TW, T13, Tj, TD, TK, TP, TH, Te, TX, T12, To, Tt, Tx, TS;
Chris@82 198 E Tw, TT, TY;
/* First half of the inputs: indices 0, 7, 4 and 3 of Cr and Ci. */
Chris@82 199 {
Chris@82 200 E T3, Tf, TC, TV, T6, Tz, Ti, TU;
Chris@82 201 {
Chris@82 202 E T1, T2, TA, TB;
Chris@82 203 T1 = Cr[0];
Chris@82 204 T2 = Cr[WS(csr, 7)];
Chris@82 205 T3 = T1 + T2;
Chris@82 206 Tf = T1 - T2;
Chris@82 207 TA = Ci[0];
Chris@82 208 TB = Ci[WS(csi, 7)];
Chris@82 209 TC = TA + TB;
Chris@82 210 TV = TB - TA;
Chris@82 211 }
Chris@82 212 {
Chris@82 213 E T4, T5, Tg, Th;
Chris@82 214 T4 = Cr[WS(csr, 4)];
Chris@82 215 T5 = Cr[WS(csr, 3)];
Chris@82 216 T6 = T4 + T5;
Chris@82 217 Tz = T4 - T5;
Chris@82 218 Tg = Ci[WS(csi, 4)];
Chris@82 219 Th = Ci[WS(csi, 3)];
Chris@82 220 Ti = Tg + Th;
Chris@82 221 TU = Tg - Th;
Chris@82 222 }
/* Combine the sums/differences of the two input pairs. */
Chris@82 223 T7 = T3 + T6;
Chris@82 224 TW = TU + TV;
Chris@82 225 T13 = TV - TU;
Chris@82 226 Tj = Tf - Ti;
Chris@82 227 TD = Tz + TC;
Chris@82 228 TK = Tz - TC;
Chris@82 229 TP = T3 - T6;
Chris@82 230 TH = Tf + Ti;
Chris@82 231 }
/* Second half of the inputs: indices 2, 5, 1 and 6 of Cr and Ci. */
Chris@82 232 {
Chris@82 233 E Ta, Tk, Tn, TR, Td, Tp, Ts, TQ;
Chris@82 234 {
Chris@82 235 E T8, T9, Tl, Tm;
Chris@82 236 T8 = Cr[WS(csr, 2)];
Chris@82 237 T9 = Cr[WS(csr, 5)];
Chris@82 238 Ta = T8 + T9;
Chris@82 239 Tk = T8 - T9;
Chris@82 240 Tl = Ci[WS(csi, 2)];
Chris@82 241 Tm = Ci[WS(csi, 5)];
Chris@82 242 Tn = Tl + Tm;
Chris@82 243 TR = Tl - Tm;
Chris@82 244 }
Chris@82 245 {
Chris@82 246 E Tb, Tc, Tq, Tr;
Chris@82 247 Tb = Cr[WS(csr, 1)];
Chris@82 248 Tc = Cr[WS(csr, 6)];
Chris@82 249 Td = Tb + Tc;
Chris@82 250 Tp = Tb - Tc;
Chris@82 251 Tq = Ci[WS(csi, 1)];
Chris@82 252 Tr = Ci[WS(csi, 6)];
Chris@82 253 Ts = Tq + Tr;
Chris@82 254 TQ = Tr - Tq;
Chris@82 255 }
Chris@82 256 Te = Ta + Td;
Chris@82 257 TX = Ta - Td;
Chris@82 258 T12 = TR + TQ;
Chris@82 259 To = Tk - Tn;
Chris@82 260 Tt = Tp - Ts;
Chris@82 261 Tx = Tp + Ts;
Chris@82 262 TS = TQ - TR;
Chris@82 263 Tw = Tk + Tn;
Chris@82 264 }
/* R0 outputs 0, 4, 1, 5: built from the pair sums (T7, Te, T12, T13, TP, TS, TW, TX). */
Chris@82 265 R0[0] = KP2_000000000 * (T7 + Te);
Chris@82 266 R0[WS(rs, 4)] = KP2_000000000 * (T13 - T12);
Chris@82 267 TT = TP + TS;
Chris@82 268 TY = TW - TX;
Chris@82 269 R0[WS(rs, 1)] = FMA(KP1_847759065, TT, KP765366864 * TY);
Chris@82 270 R0[WS(rs, 5)] = FNMS(KP765366864, TT, KP1_847759065 * TY);
/* R0 outputs 2, 6, 3, 7. */
Chris@82 271 {
Chris@82 272 E T11, T14, TZ, T10;
Chris@82 273 T11 = T7 - Te;
Chris@82 274 T14 = T12 + T13;
Chris@82 275 R0[WS(rs, 2)] = KP1_414213562 * (T11 + T14);
Chris@82 276 R0[WS(rs, 6)] = KP1_414213562 * (T14 - T11);
Chris@82 277 TZ = TP - TS;
Chris@82 278 T10 = TX + TW;
Chris@82 279 R0[WS(rs, 3)] = FMA(KP765366864, TZ, KP1_847759065 * T10);
Chris@82 280 R0[WS(rs, 7)] = FNMS(KP1_847759065, TZ, KP765366864 * T10);
Chris@82 281 }
/* R1 outputs 1, 7, 5, 3: TH and TK combined with KP707106781-scaled terms. */
Chris@82 282 {
Chris@82 283 E TJ, TN, TM, TO, TI, TL;
Chris@82 284 TI = KP707106781 * (Tw + Tx);
Chris@82 285 TJ = TH - TI;
Chris@82 286 TN = TH + TI;
Chris@82 287 TL = KP707106781 * (To - Tt);
Chris@82 288 TM = TK - TL;
Chris@82 289 TO = TL + TK;
Chris@82 290 R1[WS(rs, 1)] = FMA(KP1_662939224, TJ, KP1_111140466 * TM);
Chris@82 291 R1[WS(rs, 7)] = FNMS(KP1_961570560, TN, KP390180644 * TO);
Chris@82 292 R1[WS(rs, 5)] = FNMS(KP1_111140466, TJ, KP1_662939224 * TM);
Chris@82 293 R1[WS(rs, 3)] = FMA(KP390180644, TN, KP1_961570560 * TO);
Chris@82 294 }
/* R1 outputs 0, 6, 4, 2: Tj and TD combined with KP707106781-scaled terms. */
Chris@82 295 {
Chris@82 296 E Tv, TF, TE, TG, Tu, Ty;
Chris@82 297 Tu = KP707106781 * (To + Tt);
Chris@82 298 Tv = Tj + Tu;
Chris@82 299 TF = Tj - Tu;
Chris@82 300 Ty = KP707106781 * (Tw - Tx);
Chris@82 301 TE = Ty + TD;
Chris@82 302 TG = Ty - TD;
Chris@82 303 R1[0] = FNMS(KP390180644, TE, KP1_961570560 * Tv);
Chris@82 304 R1[WS(rs, 6)] = FNMS(KP1_662939224, TF, KP1_111140466 * TG);
Chris@82 305 R1[WS(rs, 4)] = -(FMA(KP390180644, Tv, KP1_961570560 * TE));
Chris@82 306 R1[WS(rs, 2)] = FMA(KP1_111140466, TF, KP1_662939224 * TG);
Chris@82 307 }
Chris@82 308 }
Chris@82 309 }
Chris@82 310 }
Chris@82 311
/*
 * Descriptor handed to X(kr2c_register) together with r2cbIII_16.
 * The first three numbers in the braces (54, 20, 12) match the
 * addition / multiplication / fused multiply-add counts quoted in the
 * generated comment ahead of this variant of the function.
 * NOTE(review): the leading 16 is presumably the transform size, and
 * GENUS is defined outside this file (the r2cbIII.h include is the
 * likely source) -- confirm.
 */
Chris@82 312 static const kr2c_desc desc = { 16, "r2cbIII_16", {54, 20, 12, 0}, &GENUS };
Chris@82 313
/*
 * Registration entry point: passes the planner p, the static codelet
 * r2cbIII_16 and its descriptor desc to X(kr2c_register).
 * NOTE(review): X() is a macro defined outside this file, presumably
 * adding the library's symbol prefix -- confirm.
 */
Chris@82 314 void X(codelet_r2cbIII_16) (planner *p) {
Chris@82 315 X(kr2c_register) (p, r2cbIII_16, &desc);
Chris@82 316 }
Chris@82 317
Chris@82 318 #endif