annotate src/fftw-3.3.5/rdft/scalar/r2cb/r2cbIII_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:50:42 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cbIII_16 -dft-III -include r2cbIII.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 66 FP additions, 36 FP multiplications,
Chris@42 32 * (or, 46 additions, 16 multiplications, 20 fused multiply/add),
Chris@42 33 * 55 stack variables, 9 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cbIII.h"
Chris@42 36
/*
 * r2cbIII_16, variant compiled when HAVE_FMA is defined.
 *
 * Each loop iteration reads 16 values -- Cr[0], Cr[WS(csr, 1..7)], Ci[0]
 * and Ci[WS(csi, 1..7)] -- and writes 16 values -- R0[0], R0[WS(rs, 1..7)],
 * R1[0] and R1[WS(rs, 1..7)].  The genfft command line recorded in this
 * file (-n 16 -dft-III -sign 1) describes what transform this is.
 *
 * R0, R1:  arrays written by the codelet, indexed through stride rs.
 * Cr, Ci:  arrays read by the codelet, indexed through csr and csi.
 * v:       number of loop iterations to run.
 * ivs:     amount added to Cr and Ci after each iteration.
 * ovs:     amount added to R0 and R1 after each iteration.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are declared outside this file.  FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a * b + c and c - a * b -- confirm in the
 * FFTW headers before relying on that.
 */
Chris@42 37 static void r2cbIII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/*
 * Constants.  The printed values agree numerically with:
 *   KP668178637  = tan(3*pi/16)    KP1_662939224 = 2*cos(3*pi/16)
 *   KP198912367  = tan(pi/16)      KP1_961570560 = 2*cos(pi/16)
 *   KP707106781  = sqrt(2)/2       KP1_414213562 = sqrt(2)
 *   KP414213562  = tan(pi/8)       KP1_847759065 = 2*cos(pi/8)
 */
Chris@42 39 DK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 40 DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
Chris@42 41 DK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 42 DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
Chris@42 43 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 44 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
Chris@42 45 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 46 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
Chris@42 47 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 48 {
Chris@42 49 INT i;
/* i counts down from v; the increment clause also advances all four pointers. */
Chris@42 50 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
/* Declared at loop scope: TA, TD, Tv, TG are assigned in nested blocks and used after them. */
Chris@42 51 E TA, TD, Tv, TG, TE, TF;
Chris@42 52 {
Chris@42 53 E TK, TP, T7, T13, TW, TH, Tj, TC, To, Te, TX, TS, T12, Tt, TB;
Chris@42 54 {
Chris@42 55 E T4, Tf, T3, TU, Tz, T5, Tg, Th;
Chris@42 56 {
/* Read Cr and Ci at indices 0, 7, 4, 3; form sums/differences of the 0/7 pair. */
Chris@42 57 E T1, T2, Tx, Ty;
Chris@42 58 T1 = Cr[0];
Chris@42 59 T2 = Cr[WS(csr, 7)];
Chris@42 60 Tx = Ci[0];
Chris@42 61 Ty = Ci[WS(csi, 7)];
Chris@42 62 T4 = Cr[WS(csr, 4)];
Chris@42 63 Tf = T1 - T2;
Chris@42 64 T3 = T1 + T2;
Chris@42 65 TU = Ty - Tx;
Chris@42 66 Tz = Tx + Ty;
Chris@42 67 T5 = Cr[WS(csr, 3)];
Chris@42 68 Tg = Ci[WS(csi, 4)];
Chris@42 69 Th = Ci[WS(csi, 3)];
Chris@42 70 }
Chris@42 71 {
Chris@42 72 E Tb, Tk, Ta, TR, Tn, Tc, Tq, Tr;
Chris@42 73 {
/* Read Cr and Ci at indices 2, 5, 1, 6, interleaved with combining the 4/3 pair. */
Chris@42 74 E T8, T9, Tl, Tm;
Chris@42 75 T8 = Cr[WS(csr, 2)];
Chris@42 76 {
Chris@42 77 E Tw, T6, TV, Ti;
Chris@42 78 Tw = T4 - T5;
Chris@42 79 T6 = T4 + T5;
Chris@42 80 TV = Th - Tg;
Chris@42 81 Ti = Tg + Th;
Chris@42 82 TK = Tw - Tz;
Chris@42 83 TA = Tw + Tz;
Chris@42 84 TP = T3 - T6;
Chris@42 85 T7 = T3 + T6;
Chris@42 86 T13 = TV + TU;
Chris@42 87 TW = TU - TV;
Chris@42 88 TH = Tf + Ti;
Chris@42 89 Tj = Tf - Ti;
Chris@42 90 T9 = Cr[WS(csr, 5)];
Chris@42 91 }
Chris@42 92 Tl = Ci[WS(csi, 2)];
Chris@42 93 Tm = Ci[WS(csi, 5)];
Chris@42 94 Tb = Cr[WS(csr, 1)];
Chris@42 95 Tk = T8 - T9;
Chris@42 96 Ta = T8 + T9;
Chris@42 97 TR = Tl - Tm;
Chris@42 98 Tn = Tl + Tm;
Chris@42 99 Tc = Cr[WS(csr, 6)];
Chris@42 100 Tq = Ci[WS(csi, 1)];
Chris@42 101 Tr = Ci[WS(csi, 6)];
Chris@42 102 }
Chris@42 103 TC = Tk + Tn;
Chris@42 104 To = Tk - Tn;
Chris@42 105 {
Chris@42 106 E Tp, Td, TQ, Ts;
Chris@42 107 Tp = Tb - Tc;
Chris@42 108 Td = Tb + Tc;
Chris@42 109 TQ = Tr - Tq;
Chris@42 110 Ts = Tq + Tr;
Chris@42 111 Te = Ta + Td;
Chris@42 112 TX = Ta - Td;
Chris@42 113 TS = TQ - TR;
Chris@42 114 T12 = TR + TQ;
Chris@42 115 Tt = Tp - Ts;
Chris@42 116 TB = Tp + Ts;
Chris@42 117 }
Chris@42 118 }
Chris@42 119 }
Chris@42 120 {
/* All eight R0 outputs are written in this block, in the order 0, 4, 5, 1, 6, 2, 7, 3. */
Chris@42 121 E T10, TT, TY, TZ;
Chris@42 122 R0[0] = KP2_000000000 * (T7 + Te);
Chris@42 123 R0[WS(rs, 4)] = KP2_000000000 * (T13 - T12);
Chris@42 124 T10 = TP - TS;
Chris@42 125 TT = TP + TS;
Chris@42 126 TY = TW - TX;
Chris@42 127 TZ = TX + TW;
Chris@42 128 {
Chris@42 129 E T11, T14, TI, TL, Tu;
Chris@42 130 T11 = T7 - Te;
Chris@42 131 T14 = T12 + T13;
Chris@42 132 R0[WS(rs, 5)] = KP1_847759065 * (FNMS(KP414213562, TT, TY));
Chris@42 133 R0[WS(rs, 1)] = KP1_847759065 * (FMA(KP414213562, TY, TT));
Chris@42 134 R0[WS(rs, 6)] = KP1_414213562 * (T14 - T11);
Chris@42 135 R0[WS(rs, 2)] = KP1_414213562 * (T11 + T14);
Chris@42 136 TD = TB - TC;
Chris@42 137 TI = TC + TB;
Chris@42 138 TL = To - Tt;
Chris@42 139 Tu = To + Tt;
Chris@42 140 {
Chris@42 141 E TO, TJ, TN, TM;
Chris@42 142 R0[WS(rs, 7)] = -(KP1_847759065 * (FNMS(KP414213562, TZ, T10)));
Chris@42 143 R0[WS(rs, 3)] = KP1_847759065 * (FMA(KP414213562, T10, TZ));
Chris@42 144 TO = FMA(KP707106781, TI, TH);
Chris@42 145 TJ = FNMS(KP707106781, TI, TH);
Chris@42 146 TN = FMA(KP707106781, TL, TK);
Chris@42 147 TM = FNMS(KP707106781, TL, TK);
Chris@42 148 Tv = FMA(KP707106781, Tu, Tj);
Chris@42 149 TG = FNMS(KP707106781, Tu, Tj);
/* First four R1 outputs: indices 3, 7, 5, 1. */
Chris@42 150 R1[WS(rs, 3)] = KP1_961570560 * (FMA(KP198912367, TO, TN));
Chris@42 151 R1[WS(rs, 7)] = -(KP1_961570560 * (FNMS(KP198912367, TN, TO)));
Chris@42 152 R1[WS(rs, 5)] = KP1_662939224 * (FNMS(KP668178637, TJ, TM));
Chris@42 153 R1[WS(rs, 1)] = KP1_662939224 * (FMA(KP668178637, TM, TJ));
Chris@42 154 }
Chris@42 155 }
Chris@42 156 }
Chris@42 157 }
/* Remaining R1 outputs (indices 2, 6, 4, 0), built from TA, TD, Tv and TG. */
Chris@42 158 TE = FNMS(KP707106781, TD, TA);
Chris@42 159 TF = FMA(KP707106781, TD, TA);
Chris@42 160 R1[WS(rs, 2)] = -(KP1_662939224 * (FNMS(KP668178637, TG, TF)));
Chris@42 161 R1[WS(rs, 6)] = -(KP1_662939224 * (FMA(KP668178637, TF, TG)));
Chris@42 162 R1[WS(rs, 4)] = -(KP1_961570560 * (FMA(KP198912367, Tv, TE)));
Chris@42 163 R1[0] = KP1_961570560 * (FNMS(KP198912367, TE, Tv));
Chris@42 164 }
Chris@42 165 }
Chris@42 166 }
Chris@42 167
/*
 * Descriptor handed to X(kr2c_register) together with r2cbIII_16.
 * The triple 46, 16, 20 matches the operation counts quoted in the
 * generated comment for this variant (46 additions, 16 multiplications,
 * 20 fused multiply/add).  NOTE(review): the leading 16 is presumably the
 * transform size; the fourth count and GENUS are defined elsewhere --
 * confirm against the kr2c_desc declaration.
 */
Chris@42 168 static const kr2c_desc desc = { 16, "r2cbIII_16", {46, 16, 20, 0}, &GENUS };
Chris@42 169
/*
 * Registration entry point: passes the planner p, the r2cbIII_16 kernel
 * and its descriptor to X(kr2c_register).  NOTE(review): X() is a macro
 * defined outside this file, presumably a name-prefixing wrapper -- confirm.
 */
Chris@42 170 void X(codelet_r2cbIII_16) (planner *p) {
Chris@42 171 X(kr2c_register) (p, r2cbIII_16, &desc);
Chris@42 172 }
Chris@42 173
Chris@42 174 #else /* HAVE_FMA */
Chris@42 175
Chris@42 176 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -name r2cbIII_16 -dft-III -include r2cbIII.h */
Chris@42 177
Chris@42 178 /*
Chris@42 179 * This function contains 66 FP additions, 32 FP multiplications,
Chris@42 180 * (or, 54 additions, 20 multiplications, 12 fused multiply/add),
Chris@42 181 * 40 stack variables, 9 constants, and 32 memory accesses
Chris@42 182 */
Chris@42 183 #include "r2cbIII.h"
Chris@42 184
/*
 * r2cbIII_16, variant compiled when HAVE_FMA is not defined.
 *
 * Each loop iteration reads 16 values -- Cr[0], Cr[WS(csr, 1..7)], Ci[0]
 * and Ci[WS(csi, 1..7)] -- and writes 16 values -- R0[0], R0[WS(rs, 1..7)],
 * R1[0] and R1[WS(rs, 1..7)].  The genfft command line recorded in this
 * file (-n 16 -dft-III -sign 1) describes what transform this is.
 *
 * R0, R1:  arrays written by the codelet, indexed through stride rs.
 * Cr, Ci:  arrays read by the codelet, indexed through csr and csi.
 * v:       number of loop iterations to run.
 * ivs:     amount added to Cr and Ci after each iteration.
 * ovs:     amount added to R0 and R1 after each iteration.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are declared outside this file.  This variant still
 * uses the FMA/FNMS macros; FMA(a, b, c) and FNMS(a, b, c) are presumably
 * a * b + c and c - a * b -- confirm in the FFTW headers.
 */
Chris@42 185 static void r2cbIII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 186 {
/*
 * Constants.  The printed values agree numerically with:
 *   KP1_961570560 = 2*cos(pi/16)     KP390180644   = 2*sin(pi/16)
 *   KP1_662939224 = 2*cos(3*pi/16)   KP1_111140466 = 2*sin(3*pi/16)
 *   KP1_847759065 = 2*cos(pi/8)      KP765366864   = 2*sin(pi/8)
 *   KP1_414213562 = sqrt(2)          KP707106781   = sqrt(2)/2
 */
Chris@42 187 DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
Chris@42 188 DK(KP390180644, +0.390180644032256535696569736954044481855383236);
Chris@42 189 DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
Chris@42 190 DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
Chris@42 191 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 192 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
Chris@42 193 DK(KP765366864, +0.765366864730179543456919968060797733522689125);
Chris@42 194 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
Chris@42 195 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 196 {
Chris@42 197 INT i;
/* i counts down from v; the increment clause also advances all four pointers. */
Chris@42 198 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
Chris@42 199 E T7, TW, T13, Tj, TD, TK, TP, TH, Te, TX, T12, To, Tt, Tx, TS;
Chris@42 200 E Tw, TT, TY;
Chris@42 201 {
/* Stage 1a: read Cr and Ci at indices 0, 7, 4, 3 and combine them. */
Chris@42 202 E T3, Tf, TC, TV, T6, Tz, Ti, TU;
Chris@42 203 {
Chris@42 204 E T1, T2, TA, TB;
Chris@42 205 T1 = Cr[0];
Chris@42 206 T2 = Cr[WS(csr, 7)];
Chris@42 207 T3 = T1 + T2;
Chris@42 208 Tf = T1 - T2;
Chris@42 209 TA = Ci[0];
Chris@42 210 TB = Ci[WS(csi, 7)];
Chris@42 211 TC = TA + TB;
Chris@42 212 TV = TB - TA;
Chris@42 213 }
Chris@42 214 {
Chris@42 215 E T4, T5, Tg, Th;
Chris@42 216 T4 = Cr[WS(csr, 4)];
Chris@42 217 T5 = Cr[WS(csr, 3)];
Chris@42 218 T6 = T4 + T5;
Chris@42 219 Tz = T4 - T5;
Chris@42 220 Tg = Ci[WS(csi, 4)];
Chris@42 221 Th = Ci[WS(csi, 3)];
Chris@42 222 Ti = Tg + Th;
Chris@42 223 TU = Tg - Th;
Chris@42 224 }
Chris@42 225 T7 = T3 + T6;
Chris@42 226 TW = TU + TV;
Chris@42 227 T13 = TV - TU;
Chris@42 228 Tj = Tf - Ti;
Chris@42 229 TD = Tz + TC;
Chris@42 230 TK = Tz - TC;
Chris@42 231 TP = T3 - T6;
Chris@42 232 TH = Tf + Ti;
Chris@42 233 }
Chris@42 234 {
/* Stage 1b: read Cr and Ci at indices 2, 5, 1, 6 and combine them. */
Chris@42 235 E Ta, Tk, Tn, TR, Td, Tp, Ts, TQ;
Chris@42 236 {
Chris@42 237 E T8, T9, Tl, Tm;
Chris@42 238 T8 = Cr[WS(csr, 2)];
Chris@42 239 T9 = Cr[WS(csr, 5)];
Chris@42 240 Ta = T8 + T9;
Chris@42 241 Tk = T8 - T9;
Chris@42 242 Tl = Ci[WS(csi, 2)];
Chris@42 243 Tm = Ci[WS(csi, 5)];
Chris@42 244 Tn = Tl + Tm;
Chris@42 245 TR = Tl - Tm;
Chris@42 246 }
Chris@42 247 {
Chris@42 248 E Tb, Tc, Tq, Tr;
Chris@42 249 Tb = Cr[WS(csr, 1)];
Chris@42 250 Tc = Cr[WS(csr, 6)];
Chris@42 251 Td = Tb + Tc;
Chris@42 252 Tp = Tb - Tc;
Chris@42 253 Tq = Ci[WS(csi, 1)];
Chris@42 254 Tr = Ci[WS(csi, 6)];
Chris@42 255 Ts = Tq + Tr;
Chris@42 256 TQ = Tr - Tq;
Chris@42 257 }
Chris@42 258 Te = Ta + Td;
Chris@42 259 TX = Ta - Td;
Chris@42 260 T12 = TR + TQ;
Chris@42 261 To = Tk - Tn;
Chris@42 262 Tt = Tp - Ts;
Chris@42 263 Tx = Tp + Ts;
Chris@42 264 TS = TQ - TR;
Chris@42 265 Tw = Tk + Tn;
Chris@42 266 }
/* Stage 2: the eight R0 outputs, written in the order 0, 4, 1, 5, 2, 6, 3, 7. */
Chris@42 267 R0[0] = KP2_000000000 * (T7 + Te);
Chris@42 268 R0[WS(rs, 4)] = KP2_000000000 * (T13 - T12);
Chris@42 269 TT = TP + TS;
Chris@42 270 TY = TW - TX;
Chris@42 271 R0[WS(rs, 1)] = FMA(KP1_847759065, TT, KP765366864 * TY);
Chris@42 272 R0[WS(rs, 5)] = FNMS(KP765366864, TT, KP1_847759065 * TY);
Chris@42 273 {
Chris@42 274 E T11, T14, TZ, T10;
Chris@42 275 T11 = T7 - Te;
Chris@42 276 T14 = T12 + T13;
Chris@42 277 R0[WS(rs, 2)] = KP1_414213562 * (T11 + T14);
Chris@42 278 R0[WS(rs, 6)] = KP1_414213562 * (T14 - T11);
Chris@42 279 TZ = TP - TS;
Chris@42 280 T10 = TX + TW;
Chris@42 281 R0[WS(rs, 3)] = FMA(KP765366864, TZ, KP1_847759065 * T10);
Chris@42 282 R0[WS(rs, 7)] = FNMS(KP1_847759065, TZ, KP765366864 * T10);
Chris@42 283 }
Chris@42 284 {
/* Stage 3a: R1 outputs at indices 1, 7, 5, 3, from TH, TK, Tw, Tx, To, Tt. */
Chris@42 285 E TJ, TN, TM, TO, TI, TL;
Chris@42 286 TI = KP707106781 * (Tw + Tx);
Chris@42 287 TJ = TH - TI;
Chris@42 288 TN = TH + TI;
Chris@42 289 TL = KP707106781 * (To - Tt);
Chris@42 290 TM = TK - TL;
Chris@42 291 TO = TL + TK;
Chris@42 292 R1[WS(rs, 1)] = FMA(KP1_662939224, TJ, KP1_111140466 * TM);
Chris@42 293 R1[WS(rs, 7)] = FNMS(KP1_961570560, TN, KP390180644 * TO);
Chris@42 294 R1[WS(rs, 5)] = FNMS(KP1_111140466, TJ, KP1_662939224 * TM);
Chris@42 295 R1[WS(rs, 3)] = FMA(KP390180644, TN, KP1_961570560 * TO);
Chris@42 296 }
Chris@42 297 {
/* Stage 3b: R1 outputs at indices 0, 6, 4, 2, from Tj, TD, Tw, Tx, To, Tt. */
Chris@42 298 E Tv, TF, TE, TG, Tu, Ty;
Chris@42 299 Tu = KP707106781 * (To + Tt);
Chris@42 300 Tv = Tj + Tu;
Chris@42 301 TF = Tj - Tu;
Chris@42 302 Ty = KP707106781 * (Tw - Tx);
Chris@42 303 TE = Ty + TD;
Chris@42 304 TG = Ty - TD;
Chris@42 305 R1[0] = FNMS(KP390180644, TE, KP1_961570560 * Tv);
Chris@42 306 R1[WS(rs, 6)] = FNMS(KP1_662939224, TF, KP1_111140466 * TG);
Chris@42 307 R1[WS(rs, 4)] = -(FMA(KP390180644, Tv, KP1_961570560 * TE));
Chris@42 308 R1[WS(rs, 2)] = FMA(KP1_111140466, TF, KP1_662939224 * TG);
Chris@42 309 }
Chris@42 310 }
Chris@42 311 }
Chris@42 312 }
Chris@42 313
/*
 * Descriptor handed to X(kr2c_register) together with r2cbIII_16.
 * The triple 54, 20, 12 matches the operation counts quoted in the
 * generated comment for this variant (54 additions, 20 multiplications,
 * 12 fused multiply/add).  NOTE(review): the leading 16 is presumably the
 * transform size; the fourth count and GENUS are defined elsewhere --
 * confirm against the kr2c_desc declaration.
 */
Chris@42 314 static const kr2c_desc desc = { 16, "r2cbIII_16", {54, 20, 12, 0}, &GENUS };
Chris@42 315
/*
 * Registration entry point: passes the planner p, the r2cbIII_16 kernel
 * and its descriptor to X(kr2c_register).  NOTE(review): X() is a macro
 * defined outside this file, presumably a name-prefixing wrapper -- confirm.
 */
Chris@42 316 void X(codelet_r2cbIII_16) (planner *p) {
Chris@42 317 X(kr2c_register) (p, r2cbIII_16, &desc);
Chris@42 318 }
Chris@42 319
Chris@42 320 #endif /* HAVE_FMA */