annotate src/fftw-3.3.8/rdft/scalar/r2cb/r2cb_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:30 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -name r2cb_20 -include rdft/scalar/r2cb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 86 FP additions, 44 FP multiplications,
Chris@82 32 * (or, 42 additions, 0 multiplications, 44 fused multiply/add),
Chris@82 33 * 50 stack variables, 5 constants, and 40 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/r2cb.h"
Chris@82 36
/*
 * r2cb_20 (FMA variant): straight-line size-20 kernel emitted by genfft
 * (see the "Generated by" line earlier in this #if branch).
 *
 * Per loop iteration it reads Cr[0], Cr[WS(csr, 1..10)] and Ci[WS(csi, 1..9)]
 * (20 loads) and writes R0[0], R0[WS(rs, 1..9)], R1[0] and R1[WS(rs, 1..9)]
 * (20 stores).
 *
 * Parameters, as used by the code below:
 *   R0, R1   - output arrays, indexed through WS(rs, k)
 *   Cr, Ci   - input arrays, indexed through WS(csr, k) / WS(csi, k)
 *   rs       - stride argument for both output arrays
 *   csr, csi - stride arguments for Cr and Ci respectively
 *   v        - number of loop iterations (nothing is done when v <= 0)
 *   ivs      - amount added to Cr and Ci after each iteration
 *   ovs      - amount added to R0 and R1 after each iteration
 *
 * NOTE(review): by FFTW naming, "r2cb" is presumably the backward
 * (halfcomplex-to-real) transform with R0/R1 holding even/odd real samples and
 * Cr/Ci the real/imaginary input parts -- confirm against rdft/codelet-rdft.h.
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) come from the included headers;
 * the arithmetic here is consistent with a*b + c and c - a*b -- confirm there.
 * This file is generated; statement order fixes floating-point evaluation
 * order, so the code should not be rearranged by hand.
 */
Chris@82 37 static void r2cb_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@82 38 {
/* Constants declared via DK (macro defined elsewhere). Numerically:
 * 1.902113032 ~= 2*sin(72 deg), 1.118033988 ~= sqrt(5)/2,
 * 0.618033988 ~= (sqrt(5) - 1)/2. */
Chris@82 39 DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
Chris@82 40 DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
Chris@82 41 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 43 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@82 44 {
Chris@82 45 INT i;
/* One transform per iteration; the increment clause advances all four
 * pointers. MAKE_VOLATILE_STRIDE is an opaque macro from the headers. */
Chris@82 46 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
Chris@82 47 E T5, TD, Tl, Tr, TO, T1l, T1d, T10, T1k, TT, T11, T1a, Tc, Tj, Tk;
Chris@82 48 E Tw, TB, TC, Tm, Tn, To, TE, TF, TG;
/* Base terms built from Cr[0], Cr[10], Cr[5] and Ci[5]:
 * (Cr[0] +/- Cr[10]) combined with +/- 2*Cr[5] or +/- 2*Ci[5]. */
Chris@82 49 {
Chris@82 50 E T4, Tq, T3, Tp, T1, T2;
Chris@82 51 T4 = Cr[WS(csr, 5)];
Chris@82 52 Tq = Ci[WS(csi, 5)];
Chris@82 53 T1 = Cr[0];
Chris@82 54 T2 = Cr[WS(csr, 10)];
Chris@82 55 T3 = T1 + T2;
Chris@82 56 Tp = T1 - T2;
Chris@82 57 T5 = FNMS(KP2_000000000, T4, T3);
Chris@82 58 TD = FNMS(KP2_000000000, Tq, Tp);
Chris@82 59 Tl = FMA(KP2_000000000, T4, T3);
Chris@82 60 Tr = FMA(KP2_000000000, Tq, Tp);
Chris@82 61 }
/* Sums and differences of the remaining inputs, taken in index pairs
 * (4,6), (9,1), (8,2), (7,3) for both Cr and Ci. */
Chris@82 62 {
Chris@82 63 E T8, Ts, TR, T19, Tb, T18, Tv, TS, Tf, Tx, TM, T1c, Ti, T1b, TA;
Chris@82 64 E TN;
/* Pair (4, 6). */
Chris@82 65 {
Chris@82 66 E T6, T7, TP, TQ;
Chris@82 67 T6 = Cr[WS(csr, 4)];
Chris@82 68 T7 = Cr[WS(csr, 6)];
Chris@82 69 T8 = T6 + T7;
Chris@82 70 Ts = T6 - T7;
Chris@82 71 TP = Ci[WS(csi, 4)];
Chris@82 72 TQ = Ci[WS(csi, 6)];
Chris@82 73 TR = TP - TQ;
Chris@82 74 T19 = TP + TQ;
Chris@82 75 }
/* Pair (9, 1). */
Chris@82 76 {
Chris@82 77 E T9, Ta, Tt, Tu;
Chris@82 78 T9 = Cr[WS(csr, 9)];
Chris@82 79 Ta = Cr[WS(csr, 1)];
Chris@82 80 Tb = T9 + Ta;
Chris@82 81 T18 = T9 - Ta;
Chris@82 82 Tt = Ci[WS(csi, 9)];
Chris@82 83 Tu = Ci[WS(csi, 1)];
Chris@82 84 Tv = Tt + Tu;
Chris@82 85 TS = Tt - Tu;
Chris@82 86 }
/* Pair (8, 2). */
Chris@82 87 {
Chris@82 88 E Td, Te, TK, TL;
Chris@82 89 Td = Cr[WS(csr, 8)];
Chris@82 90 Te = Cr[WS(csr, 2)];
Chris@82 91 Tf = Td + Te;
Chris@82 92 Tx = Td - Te;
Chris@82 93 TK = Ci[WS(csi, 8)];
Chris@82 94 TL = Ci[WS(csi, 2)];
Chris@82 95 TM = TK - TL;
Chris@82 96 T1c = TK + TL;
Chris@82 97 }
/* Pair (7, 3). Note TN is Ci[3] - Ci[7], the opposite operand order
 * from the other Ci differences above. */
Chris@82 98 {
Chris@82 99 E Tg, Th, Ty, Tz;
Chris@82 100 Tg = Cr[WS(csr, 7)];
Chris@82 101 Th = Cr[WS(csr, 3)];
Chris@82 102 Ti = Tg + Th;
Chris@82 103 T1b = Tg - Th;
Chris@82 104 Ty = Ci[WS(csi, 7)];
Chris@82 105 Tz = Ci[WS(csi, 3)];
Chris@82 106 TA = Ty + Tz;
Chris@82 107 TN = Tz - Ty;
Chris@82 108 }
/* Second-level combinations shared by the four output groups below. */
Chris@82 109 TO = TM - TN;
Chris@82 110 T1l = T19 - T18;
Chris@82 111 T1d = T1b + T1c;
Chris@82 112 T10 = TS + TR;
Chris@82 113 T1k = T1c - T1b;
Chris@82 114 TT = TR - TS;
Chris@82 115 T11 = TN + TM;
Chris@82 116 T1a = T18 + T19;
Chris@82 117 Tc = T8 - Tb;
Chris@82 118 Tj = Tf - Ti;
Chris@82 119 Tk = Tc + Tj;
Chris@82 120 Tw = Ts + Tv;
Chris@82 121 TB = Tx - TA;
Chris@82 122 TC = Tw + TB;
Chris@82 123 Tm = T8 + Tb;
Chris@82 124 Tn = Tf + Ti;
Chris@82 125 To = Tm + Tn;
Chris@82 126 TE = Ts - Tv;
Chris@82 127 TF = Tx + TA;
Chris@82 128 TG = TE + TF;
Chris@82 129 }
/* Four outputs formed directly from a base term and twice a combined sum. */
Chris@82 130 R0[WS(rs, 5)] = FMA(KP2_000000000, Tk, T5);
Chris@82 131 R1[WS(rs, 7)] = FMA(KP2_000000000, TC, Tr);
Chris@82 132 R1[WS(rs, 2)] = FMA(KP2_000000000, TG, TD);
Chris@82 133 R0[0] = FMA(KP2_000000000, To, Tl);
/* Outputs R0[1], R0[3], R0[7], R0[9] (from T5, Tk, Tc - Tj, TO, TT). */
Chris@82 134 {
Chris@82 135 E TU, TW, TJ, TV, TH, TI;
Chris@82 136 TU = FNMS(KP618033988, TT, TO);
Chris@82 137 TW = FMA(KP618033988, TO, TT);
Chris@82 138 TH = FNMS(KP500000000, Tk, T5);
Chris@82 139 TI = Tc - Tj;
Chris@82 140 TJ = FNMS(KP1_118033988, TI, TH);
Chris@82 141 TV = FMA(KP1_118033988, TI, TH);
Chris@82 142 R0[WS(rs, 9)] = FNMS(KP1_902113032, TU, TJ);
Chris@82 143 R0[WS(rs, 7)] = FMA(KP1_902113032, TW, TV);
Chris@82 144 R0[WS(rs, 1)] = FMA(KP1_902113032, TU, TJ);
Chris@82 145 R0[WS(rs, 3)] = FNMS(KP1_902113032, TW, TV);
Chris@82 146 }
/* Outputs R1[0], R1[4], R1[6], R1[8] (from TD, TG, TE - TF, T1a, T1d). */
Chris@82 147 {
Chris@82 148 E T1e, T1g, T17, T1f, T15, T16;
Chris@82 149 T1e = FMA(KP618033988, T1d, T1a);
Chris@82 150 T1g = FNMS(KP618033988, T1a, T1d);
Chris@82 151 T15 = FNMS(KP500000000, TG, TD);
Chris@82 152 T16 = TE - TF;
Chris@82 153 T17 = FMA(KP1_118033988, T16, T15);
Chris@82 154 T1f = FNMS(KP1_118033988, T16, T15);
Chris@82 155 R1[0] = FNMS(KP1_902113032, T1e, T17);
Chris@82 156 R1[WS(rs, 8)] = FMA(KP1_902113032, T1g, T1f);
Chris@82 157 R1[WS(rs, 4)] = FMA(KP1_902113032, T1e, T17);
Chris@82 158 R1[WS(rs, 6)] = FNMS(KP1_902113032, T1g, T1f);
Chris@82 159 }
/* Outputs R1[1], R1[3], R1[5], R1[9] (from Tr, TC, Tw - TB, T1k, T1l). */
Chris@82 160 {
Chris@82 161 E T1m, T1o, T1j, T1n, T1h, T1i;
Chris@82 162 T1m = FNMS(KP618033988, T1l, T1k);
Chris@82 163 T1o = FMA(KP618033988, T1k, T1l);
Chris@82 164 T1h = FNMS(KP500000000, TC, Tr);
Chris@82 165 T1i = Tw - TB;
Chris@82 166 T1j = FNMS(KP1_118033988, T1i, T1h);
Chris@82 167 T1n = FMA(KP1_118033988, T1i, T1h);
Chris@82 168 R1[WS(rs, 1)] = FNMS(KP1_902113032, T1m, T1j);
Chris@82 169 R1[WS(rs, 9)] = FMA(KP1_902113032, T1o, T1n);
Chris@82 170 R1[WS(rs, 3)] = FMA(KP1_902113032, T1m, T1j);
Chris@82 171 R1[WS(rs, 5)] = FNMS(KP1_902113032, T1o, T1n);
Chris@82 172 }
/* Outputs R0[2], R0[4], R0[6], R0[8] (from Tl, To, Tm - Tn, T10, T11). */
Chris@82 173 {
Chris@82 174 E T12, T14, TZ, T13, TX, TY;
Chris@82 175 T12 = FMA(KP618033988, T11, T10);
Chris@82 176 T14 = FNMS(KP618033988, T10, T11);
Chris@82 177 TX = FNMS(KP500000000, To, Tl);
Chris@82 178 TY = Tm - Tn;
Chris@82 179 TZ = FMA(KP1_118033988, TY, TX);
Chris@82 180 T13 = FNMS(KP1_118033988, TY, TX);
Chris@82 181 R0[WS(rs, 8)] = FNMS(KP1_902113032, T12, TZ);
Chris@82 182 R0[WS(rs, 6)] = FMA(KP1_902113032, T14, T13);
Chris@82 183 R0[WS(rs, 2)] = FMA(KP1_902113032, T12, TZ);
Chris@82 184 R0[WS(rs, 4)] = FNMS(KP1_902113032, T14, T13);
Chris@82 185 }
Chris@82 186 }
Chris@82 187 }
Chris@82 188 }
Chris@82 189
/* Descriptor handed to kr2c_register together with r2cb_20 (FMA variant):
 * size 20, the kernel name, and a four-number tuple whose first three values
 * (42, 0, 44) match the additions / multiplications / fused multiply-add
 * counts in the generated header comment of this #if branch.
 * NOTE(review): the fourth tuple field and GENUS are defined outside this
 * file -- confirm their meaning in the included headers. */
Chris@82 190 static const kr2c_desc desc = { 20, "r2cb_20", {42, 0, 44, 0}, &GENUS };
Chris@82 191
/* Registration entry point for the FMA variant: passes planner p, the
 * r2cb_20 kernel and its descriptor to kr2c_register.
 * NOTE(review): X() is a macro from the included headers, presumably a
 * name-prefixing wrapper -- confirm. */
Chris@82 192 void X(codelet_r2cb_20) (planner *p) {
Chris@82 193 X(kr2c_register) (p, r2cb_20, &desc);
Chris@82 194 }
Chris@82 195
Chris@82 196 #else
Chris@82 197
Chris@82 198 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 20 -name r2cb_20 -include rdft/scalar/r2cb.h */
Chris@82 199
Chris@82 200 /*
Chris@82 201 * This function contains 86 FP additions, 30 FP multiplications,
Chris@82 202 * (or, 70 additions, 14 multiplications, 16 fused multiply/add),
Chris@82 203 * 50 stack variables, 5 constants, and 40 memory accesses
Chris@82 204 */
Chris@82 205 #include "rdft/scalar/r2cb.h"
Chris@82 206
/*
 * r2cb_20 (non-FMA variant): straight-line size-20 kernel emitted by genfft
 * without the -fma option (see the "Generated by" line earlier in this #else
 * branch). It is compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Per loop iteration it reads Cr[0], Cr[WS(csr, 1..10)] and Ci[WS(csi, 1..9)]
 * (20 loads) and writes R0[0], R0[WS(rs, 1..9)], R1[0] and R1[WS(rs, 1..9)]
 * (20 stores).
 *
 * Parameters, as used by the code below:
 *   R0, R1   - output arrays, indexed through WS(rs, k)
 *   Cr, Ci   - input arrays, indexed through WS(csr, k) / WS(csi, k)
 *   rs       - stride argument for both output arrays
 *   csr, csi - stride arguments for Cr and Ci respectively
 *   v        - number of loop iterations (nothing is done when v <= 0)
 *   ivs      - amount added to Cr and Ci after each iteration
 *   ovs      - amount added to R0 and R1 after each iteration
 *
 * NOTE(review): by FFTW naming, "r2cb" is presumably the backward
 * (halfcomplex-to-real) transform with R0/R1 holding even/odd real samples and
 * Cr/Ci the real/imaginary input parts -- confirm against rdft/codelet-rdft.h.
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) come from the included headers;
 * the arithmetic here is consistent with a*b + c and c - a*b -- confirm there.
 * This file is generated; statement order fixes floating-point evaluation
 * order, so the code should not be rearranged by hand.
 */
Chris@82 207 static void r2cb_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@82 208 {
/* Constants declared via DK (macro defined elsewhere). Numerically:
 * 1.118033988 ~= sqrt(5)/2, 1.902113032 ~= 2*sin(72 deg),
 * 1.175570504 ~= 2*sin(36 deg). */
Chris@82 209 DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
Chris@82 210 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 211 DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
Chris@82 212 DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
Chris@82 213 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@82 214 {
Chris@82 215 INT i;
/* One transform per iteration; the increment clause advances all four
 * pointers. MAKE_VOLATILE_STRIDE is an opaque macro from the headers. */
Chris@82 216 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
Chris@82 217 E T6, TF, Tm, Tt, TQ, T1n, T1f, T12, T1m, TV, T13, T1c, Td, Tk, Tl;
Chris@82 218 E Ty, TD, TE, Tn, To, Tp, TG, TH, TI;
/* Base terms built from Cr[0], Cr[10], Cr[5] and Ci[5]:
 * (Cr[0] +/- Cr[10]) plus or minus the doubled Cr[5] / Ci[5]. */
Chris@82 219 {
Chris@82 220 E T5, Ts, T3, Tq;
Chris@82 221 {
Chris@82 222 E T4, Tr, T1, T2;
Chris@82 223 T4 = Cr[WS(csr, 5)];
Chris@82 224 T5 = KP2_000000000 * T4;
Chris@82 225 Tr = Ci[WS(csi, 5)];
Chris@82 226 Ts = KP2_000000000 * Tr;
Chris@82 227 T1 = Cr[0];
Chris@82 228 T2 = Cr[WS(csr, 10)];
Chris@82 229 T3 = T1 + T2;
Chris@82 230 Tq = T1 - T2;
Chris@82 231 }
Chris@82 232 T6 = T3 - T5;
Chris@82 233 TF = Tq - Ts;
Chris@82 234 Tm = T3 + T5;
Chris@82 235 Tt = Tq + Ts;
Chris@82 236 }
/* Sums and differences of the remaining inputs, taken in index pairs
 * (4,6), (9,1), (8,2), (7,3) for both Cr and Ci. */
Chris@82 237 {
Chris@82 238 E T9, Tu, TO, T1b, Tc, T1a, Tx, TP, Tg, Tz, TT, T1e, Tj, T1d, TC;
Chris@82 239 E TU;
/* Pair (4, 6). */
Chris@82 240 {
Chris@82 241 E T7, T8, TM, TN;
Chris@82 242 T7 = Cr[WS(csr, 4)];
Chris@82 243 T8 = Cr[WS(csr, 6)];
Chris@82 244 T9 = T7 + T8;
Chris@82 245 Tu = T7 - T8;
Chris@82 246 TM = Ci[WS(csi, 4)];
Chris@82 247 TN = Ci[WS(csi, 6)];
Chris@82 248 TO = TM - TN;
Chris@82 249 T1b = TM + TN;
Chris@82 250 }
/* Pair (9, 1). */
Chris@82 251 {
Chris@82 252 E Ta, Tb, Tv, Tw;
Chris@82 253 Ta = Cr[WS(csr, 9)];
Chris@82 254 Tb = Cr[WS(csr, 1)];
Chris@82 255 Tc = Ta + Tb;
Chris@82 256 T1a = Ta - Tb;
Chris@82 257 Tv = Ci[WS(csi, 9)];
Chris@82 258 Tw = Ci[WS(csi, 1)];
Chris@82 259 Tx = Tv + Tw;
Chris@82 260 TP = Tv - Tw;
Chris@82 261 }
/* Pair (8, 2). */
Chris@82 262 {
Chris@82 263 E Te, Tf, TR, TS;
Chris@82 264 Te = Cr[WS(csr, 8)];
Chris@82 265 Tf = Cr[WS(csr, 2)];
Chris@82 266 Tg = Te + Tf;
Chris@82 267 Tz = Te - Tf;
Chris@82 268 TR = Ci[WS(csi, 8)];
Chris@82 269 TS = Ci[WS(csi, 2)];
Chris@82 270 TT = TR - TS;
Chris@82 271 T1e = TR + TS;
Chris@82 272 }
/* Pair (7, 3). Note TU is Ci[3] - Ci[7], the opposite operand order
 * from the other Ci differences above. */
Chris@82 273 {
Chris@82 274 E Th, Ti, TA, TB;
Chris@82 275 Th = Cr[WS(csr, 7)];
Chris@82 276 Ti = Cr[WS(csr, 3)];
Chris@82 277 Tj = Th + Ti;
Chris@82 278 T1d = Th - Ti;
Chris@82 279 TA = Ci[WS(csi, 7)];
Chris@82 280 TB = Ci[WS(csi, 3)];
Chris@82 281 TC = TA + TB;
Chris@82 282 TU = TB - TA;
Chris@82 283 }
/* Second-level combinations shared by the four output groups below. */
Chris@82 284 TQ = TO - TP;
Chris@82 285 T1n = T1e - T1d;
Chris@82 286 T1f = T1d + T1e;
Chris@82 287 T12 = TP + TO;
Chris@82 288 T1m = T1b - T1a;
Chris@82 289 TV = TT - TU;
Chris@82 290 T13 = TU + TT;
Chris@82 291 T1c = T1a + T1b;
Chris@82 292 Td = T9 - Tc;
Chris@82 293 Tk = Tg - Tj;
Chris@82 294 Tl = Td + Tk;
Chris@82 295 Ty = Tu + Tx;
Chris@82 296 TD = Tz - TC;
Chris@82 297 TE = Ty + TD;
Chris@82 298 Tn = T9 + Tc;
Chris@82 299 To = Tg + Tj;
Chris@82 300 Tp = Tn + To;
Chris@82 301 TG = Tu - Tx;
Chris@82 302 TH = Tz + TC;
Chris@82 303 TI = TG + TH;
Chris@82 304 }
/* Four outputs formed directly from a base term and twice a combined sum. */
Chris@82 305 R0[WS(rs, 5)] = FMA(KP2_000000000, Tl, T6);
Chris@82 306 R1[WS(rs, 7)] = FMA(KP2_000000000, TE, Tt);
Chris@82 307 R1[WS(rs, 2)] = FMA(KP2_000000000, TI, TF);
Chris@82 308 R0[0] = FMA(KP2_000000000, Tp, Tm);
/* Outputs R0[1], R0[3], R0[7], R0[9] (from T6, Tl, Td - Tk, TQ, TV). */
Chris@82 309 {
Chris@82 310 E TW, TY, TL, TX, TJ, TK;
Chris@82 311 TW = FNMS(KP1_902113032, TV, KP1_175570504 * TQ);
Chris@82 312 TY = FMA(KP1_902113032, TQ, KP1_175570504 * TV);
Chris@82 313 TJ = FNMS(KP500000000, Tl, T6);
Chris@82 314 TK = KP1_118033988 * (Td - Tk);
Chris@82 315 TL = TJ - TK;
Chris@82 316 TX = TK + TJ;
Chris@82 317 R0[WS(rs, 1)] = TL - TW;
Chris@82 318 R0[WS(rs, 7)] = TX + TY;
Chris@82 319 R0[WS(rs, 9)] = TL + TW;
Chris@82 320 R0[WS(rs, 3)] = TX - TY;
Chris@82 321 }
/* Outputs R1[0], R1[4], R1[6], R1[8] (from TF, TI, TG - TH, T1c, T1f). */
Chris@82 322 {
Chris@82 323 E T1g, T1i, T19, T1h, T17, T18;
Chris@82 324 T1g = FNMS(KP1_902113032, T1f, KP1_175570504 * T1c);
Chris@82 325 T1i = FMA(KP1_902113032, T1c, KP1_175570504 * T1f);
Chris@82 326 T17 = FNMS(KP500000000, TI, TF);
Chris@82 327 T18 = KP1_118033988 * (TG - TH);
Chris@82 328 T19 = T17 - T18;
Chris@82 329 T1h = T18 + T17;
Chris@82 330 R1[WS(rs, 8)] = T19 - T1g;
Chris@82 331 R1[WS(rs, 4)] = T1h + T1i;
Chris@82 332 R1[WS(rs, 6)] = T19 + T1g;
Chris@82 333 R1[0] = T1h - T1i;
Chris@82 334 }
/* Outputs R1[1], R1[3], R1[5], R1[9] (from Tt, TE, Ty - TD, T1m, T1n). */
Chris@82 335 {
Chris@82 336 E T1o, T1q, T1l, T1p, T1j, T1k;
Chris@82 337 T1o = FNMS(KP1_902113032, T1n, KP1_175570504 * T1m);
Chris@82 338 T1q = FMA(KP1_902113032, T1m, KP1_175570504 * T1n);
Chris@82 339 T1j = FNMS(KP500000000, TE, Tt);
Chris@82 340 T1k = KP1_118033988 * (Ty - TD);
Chris@82 341 T1l = T1j - T1k;
Chris@82 342 T1p = T1k + T1j;
Chris@82 343 R1[WS(rs, 3)] = T1l - T1o;
Chris@82 344 R1[WS(rs, 9)] = T1p + T1q;
Chris@82 345 R1[WS(rs, 1)] = T1l + T1o;
Chris@82 346 R1[WS(rs, 5)] = T1p - T1q;
Chris@82 347 }
/* Outputs R0[2], R0[4], R0[6], R0[8] (from Tm, Tp, Tn - To, T12, T13). */
Chris@82 348 {
Chris@82 349 E T14, T16, T11, T15, TZ, T10;
Chris@82 350 T14 = FNMS(KP1_902113032, T13, KP1_175570504 * T12);
Chris@82 351 T16 = FMA(KP1_902113032, T12, KP1_175570504 * T13);
Chris@82 352 TZ = FNMS(KP500000000, Tp, Tm);
Chris@82 353 T10 = KP1_118033988 * (Tn - To);
Chris@82 354 T11 = TZ - T10;
Chris@82 355 T15 = T10 + TZ;
Chris@82 356 R0[WS(rs, 6)] = T11 - T14;
Chris@82 357 R0[WS(rs, 2)] = T15 + T16;
Chris@82 358 R0[WS(rs, 4)] = T11 + T14;
Chris@82 359 R0[WS(rs, 8)] = T15 - T16;
Chris@82 360 }
Chris@82 361 }
Chris@82 362 }
Chris@82 363 }
Chris@82 364
/* Descriptor handed to kr2c_register together with r2cb_20 (non-FMA variant):
 * size 20, the kernel name, and a four-number tuple whose first three values
 * (70, 14, 16) match the additions / multiplications / fused multiply-add
 * counts in the generated header comment of this #else branch.
 * NOTE(review): the fourth tuple field and GENUS are defined outside this
 * file -- confirm their meaning in the included headers. */
Chris@82 365 static const kr2c_desc desc = { 20, "r2cb_20", {70, 14, 16, 0}, &GENUS };
Chris@82 366
/* Registration entry point for the non-FMA variant: passes planner p, the
 * r2cb_20 kernel and its descriptor to kr2c_register.
 * NOTE(review): X() is a macro from the included headers, presumably a
 * name-prefixing wrapper -- confirm. */
Chris@82 367 void X(codelet_r2cb_20) (planner *p) {
Chris@82 368 X(kr2c_register) (p, r2cb_20, &desc);
Chris@82 369 }
Chris@82 370
Chris@82 371 #endif