annotate src/fftw-3.3.8/rdft/scalar/r2cf/hc2cfdft_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:10 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cfdft_8 -include rdft/scalar/hc2cf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 82 FP additions, 52 FP multiplications,
Chris@82 32 * (or, 60 additions, 30 multiplications, 22 fused multiply/add),
Chris@82 33 * 31 stack variables, 2 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cf.h"
Chris@82 36
/*
 * hc2cfdft_8 (variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * In-place kernel. For every m in [mb, me) it loads four values from each
 * of Rp, Ip, Rm and Im (indices 0 and WS(rs, 1..3)), combines them with the
 * 14 twiddle values W[0..13], and stores 16 results back into those same 16
 * locations, each scaled by KP500000000.
 *
 * Parameters:
 *   Rp, Ip  - arrays read and written; advanced by +ms after each iteration.
 *   Rm, Im  - arrays read and written; advanced by -ms after each iteration.
 *   W       - twiddle table; iteration m reads the 14 values starting at
 *             W + (m - 1) * 14 (relative to the pointer passed in).
 *   rs      - stride handed to WS() to form the element indices.
 *   mb, me  - half-open iteration range [mb, me).
 *   ms      - pointer step applied between iterations.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this
 * file; presumably a*b + c and c - a*b respectively -- confirm in the
 * codelet headers. MAKE_VOLATILE_STRIDE, DK, E, R, INT and stride are
 * likewise declared elsewhere.
 */
Chris@82 37 static void hc2cfdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically 1/sqrt(2) */
Chris@82 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 scale applied to every output */
Chris@82 41 {
Chris@82 42 INT m;
Chris@82 43 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) { /* Rp/Ip step forward, Rm/Im step backward, W advances 14 per iteration */
Chris@82 44 E Ty, T14, TO, T1o, Tv, T16, TG, T1m, Ta, T19, TV, T1h, Tk, T1b, T11;
Chris@82 45 E T1j;
Chris@82 46 { /* index 0: Ty/T14 are the plain difference/sum; TO/T1o combine (Rm - Rp, Ip + Im) with W[0], W[1] */
Chris@82 47 E Tw, Tx, TN, TI, TJ, TK;
Chris@82 48 Tw = Ip[0];
Chris@82 49 Tx = Im[0];
Chris@82 50 TN = Tw + Tx;
Chris@82 51 TI = Rm[0];
Chris@82 52 TJ = Rp[0];
Chris@82 53 TK = TI - TJ;
Chris@82 54 Ty = Tw - Tx;
Chris@82 55 T14 = TJ + TI;
Chris@82 56 {
Chris@82 57 E TH, TL, TM, T1n;
Chris@82 58 TH = W[0];
Chris@82 59 TL = TH * TK;
Chris@82 60 TM = W[1];
Chris@82 61 T1n = TM * TK;
Chris@82 62 TO = FNMS(TM, TN, TL);
Chris@82 63 T1o = FMA(TH, TN, T1n);
Chris@82 64 }
Chris@82 65 }
Chris@82 66 { /* index WS(rs, 2): Tv/T16 use W[6], W[7]; TG/T1m use W[8], W[9] */
Chris@82 67 E Tp, TF, Tu, TC;
Chris@82 68 {
Chris@82 69 E Tn, To, Ts, Tt;
Chris@82 70 Tn = Ip[WS(rs, 2)];
Chris@82 71 To = Im[WS(rs, 2)];
Chris@82 72 Tp = Tn - To;
Chris@82 73 TF = Tn + To;
Chris@82 74 Ts = Rp[WS(rs, 2)];
Chris@82 75 Tt = Rm[WS(rs, 2)];
Chris@82 76 Tu = Ts + Tt;
Chris@82 77 TC = Tt - Ts;
Chris@82 78 }
Chris@82 79 {
Chris@82 80 E Tq, T15, Tm, Tr;
Chris@82 81 Tm = W[6];
Chris@82 82 Tq = Tm * Tp;
Chris@82 83 T15 = Tm * Tu;
Chris@82 84 Tr = W[7];
Chris@82 85 Tv = FNMS(Tr, Tu, Tq);
Chris@82 86 T16 = FMA(Tr, Tp, T15);
Chris@82 87 }
Chris@82 88 {
Chris@82 89 E TB, TD, TE, T1l;
Chris@82 90 TB = W[8];
Chris@82 91 TD = TB * TC;
Chris@82 92 TE = W[9];
Chris@82 93 T1l = TE * TC;
Chris@82 94 TG = FNMS(TE, TF, TD);
Chris@82 95 T1m = FMA(TB, TF, T1l);
Chris@82 96 }
Chris@82 97 }
Chris@82 98 { /* index WS(rs, 1): Ta/T19 use W[2], W[3]; TV/T1h use W[4], W[5] */
Chris@82 99 E T4, TU, T9, TR;
Chris@82 100 {
Chris@82 101 E T2, T3, T7, T8;
Chris@82 102 T2 = Ip[WS(rs, 1)];
Chris@82 103 T3 = Im[WS(rs, 1)];
Chris@82 104 T4 = T2 - T3;
Chris@82 105 TU = T2 + T3;
Chris@82 106 T7 = Rp[WS(rs, 1)];
Chris@82 107 T8 = Rm[WS(rs, 1)];
Chris@82 108 T9 = T7 + T8;
Chris@82 109 TR = T7 - T8;
Chris@82 110 }
Chris@82 111 {
Chris@82 112 E T5, T18, T1, T6;
Chris@82 113 T1 = W[2];
Chris@82 114 T5 = T1 * T4;
Chris@82 115 T18 = T1 * T9;
Chris@82 116 T6 = W[3];
Chris@82 117 Ta = FNMS(T6, T9, T5);
Chris@82 118 T19 = FMA(T6, T4, T18);
Chris@82 119 }
Chris@82 120 {
Chris@82 121 E TS, T1g, TQ, TT;
Chris@82 122 TQ = W[4];
Chris@82 123 TS = TQ * TR;
Chris@82 124 T1g = TQ * TU;
Chris@82 125 TT = W[5];
Chris@82 126 TV = FMA(TT, TU, TS);
Chris@82 127 T1h = FNMS(TT, TR, T1g);
Chris@82 128 }
Chris@82 129 }
Chris@82 130 { /* index WS(rs, 3): Tk/T1b use W[10], W[11]; T11/T1j use W[12], W[13] */
Chris@82 131 E Te, T10, Tj, TX;
Chris@82 132 {
Chris@82 133 E Tc, Td, Th, Ti;
Chris@82 134 Tc = Ip[WS(rs, 3)];
Chris@82 135 Td = Im[WS(rs, 3)];
Chris@82 136 Te = Tc - Td;
Chris@82 137 T10 = Tc + Td;
Chris@82 138 Th = Rp[WS(rs, 3)];
Chris@82 139 Ti = Rm[WS(rs, 3)];
Chris@82 140 Tj = Th + Ti;
Chris@82 141 TX = Th - Ti;
Chris@82 142 }
Chris@82 143 {
Chris@82 144 E Tf, T1a, Tb, Tg;
Chris@82 145 Tb = W[10];
Chris@82 146 Tf = Tb * Te;
Chris@82 147 T1a = Tb * Tj;
Chris@82 148 Tg = W[11];
Chris@82 149 Tk = FNMS(Tg, Tj, Tf);
Chris@82 150 T1b = FMA(Tg, Te, T1a);
Chris@82 151 }
Chris@82 152 {
Chris@82 153 E TY, T1i, TW, TZ;
Chris@82 154 TW = W[12];
Chris@82 155 TY = TW * TX;
Chris@82 156 T1i = TW * T10;
Chris@82 157 TZ = W[13];
Chris@82 158 T11 = FMA(TZ, T10, TY);
Chris@82 159 T1j = FNMS(TZ, TX, T1i);
Chris@82 160 }
Chris@82 161 }
Chris@82 162 { /* eight outputs that need only sums/differences and the 1/2 scale */
Chris@82 163 E TA, T1f, T1q, T1s, T13, T1e, T1d, T1r;
Chris@82 164 {
Chris@82 165 E Tl, Tz, T1k, T1p;
Chris@82 166 Tl = Ta + Tk;
Chris@82 167 Tz = Tv + Ty;
Chris@82 168 TA = Tl + Tz;
Chris@82 169 T1f = Tz - Tl;
Chris@82 170 T1k = T1h + T1j;
Chris@82 171 T1p = T1m + T1o;
Chris@82 172 T1q = T1k - T1p;
Chris@82 173 T1s = T1k + T1p;
Chris@82 174 }
Chris@82 175 {
Chris@82 176 E TP, T12, T17, T1c;
Chris@82 177 TP = TG + TO;
Chris@82 178 T12 = TV + T11;
Chris@82 179 T13 = TP - T12;
Chris@82 180 T1e = T12 + TP;
Chris@82 181 T17 = T14 + T16;
Chris@82 182 T1c = T19 + T1b;
Chris@82 183 T1d = T17 - T1c;
Chris@82 184 T1r = T17 + T1c;
Chris@82 185 }
Chris@82 186 Ip[0] = KP500000000 * (TA + T13);
Chris@82 187 Rp[0] = KP500000000 * (T1r + T1s);
Chris@82 188 Im[WS(rs, 3)] = KP500000000 * (T13 - TA);
Chris@82 189 Rm[WS(rs, 3)] = KP500000000 * (T1r - T1s);
Chris@82 190 Rm[WS(rs, 1)] = KP500000000 * (T1d - T1e);
Chris@82 191 Im[WS(rs, 1)] = KP500000000 * (T1q - T1f);
Chris@82 192 Rp[WS(rs, 2)] = KP500000000 * (T1d + T1e);
Chris@82 193 Ip[WS(rs, 2)] = KP500000000 * (T1f + T1q);
Chris@82 194 }
Chris@82 195 { /* remaining eight outputs, which additionally use the KP707106781 factor */
Chris@82 196 E T1v, T1H, T1F, T1L, T1y, T1I, T1B, T1J;
Chris@82 197 {
Chris@82 198 E T1t, T1u, T1D, T1E;
Chris@82 199 T1t = Ty - Tv;
Chris@82 200 T1u = T19 - T1b;
Chris@82 201 T1v = T1t - T1u;
Chris@82 202 T1H = T1u + T1t;
Chris@82 203 T1D = T14 - T16;
Chris@82 204 T1E = Ta - Tk;
Chris@82 205 T1F = T1D - T1E;
Chris@82 206 T1L = T1D + T1E;
Chris@82 207 }
Chris@82 208 {
Chris@82 209 E T1w, T1x, T1z, T1A;
Chris@82 210 T1w = T1j - T1h;
Chris@82 211 T1x = TV - T11;
Chris@82 212 T1y = T1w + T1x;
Chris@82 213 T1I = T1w - T1x;
Chris@82 214 T1z = TO - TG;
Chris@82 215 T1A = T1o - T1m;
Chris@82 216 T1B = T1z - T1A;
Chris@82 217 T1J = T1z + T1A;
Chris@82 218 }
Chris@82 219 {
Chris@82 220 E T1C, T1M, T1G, T1K;
Chris@82 221 T1C = T1y + T1B;
Chris@82 222 Ip[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1C, T1v));
Chris@82 223 Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP707106781, T1C, T1v)));
Chris@82 224 T1M = T1I + T1J;
Chris@82 225 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP707106781, T1M, T1L));
Chris@82 226 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1M, T1L));
Chris@82 227 T1G = T1B - T1y;
Chris@82 228 Rm[0] = KP500000000 * (FNMS(KP707106781, T1G, T1F));
Chris@82 229 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1G, T1F));
Chris@82 230 T1K = T1I - T1J;
Chris@82 231 Ip[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1K, T1H));
Chris@82 232 Im[0] = -(KP500000000 * (FNMS(KP707106781, T1K, T1H)));
Chris@82 233 }
Chris@82 234 }
Chris@82 235 }
Chris@82 236 }
Chris@82 237 }
Chris@82 238
/*
 * Twiddle-instruction table referenced by the codelet descriptor below.
 * NOTE(review): TW_FULL / TW_NEXT and the tw_instr field layout are declared
 * elsewhere; presumably {TW_FULL, 1, 8} requests the full twiddle set for
 * size 8 (consistent with the kernel consuming 14 reals of W per iteration)
 * and TW_NEXT terminates the list -- confirm against the tw_instr definition.
 */
Chris@82 239 static const tw_instr twinstr[] = {
Chris@82 240 {TW_FULL, 1, 8},
Chris@82 241 {TW_NEXT, 1, 0}
Chris@82 242 };
Chris@82 243
/*
 * Descriptor handed to the registration call for the FMA variant: size 8,
 * the codelet name, its twiddle table, and &GENUS. The first three entries
 * of {60, 30, 22, 0} match the additions / multiplications / fused
 * multiply-add counts quoted in this variant's generated operation-count
 * comment. NOTE(review): the meaning of the trailing 0 and the exact
 * hc2c_desc field names are not visible in this file.
 */
Chris@82 244 static const hc2c_desc desc = { 8, "hc2cfdft_8", twinstr, &GENUS, {60, 30, 22, 0} };
Chris@82 245
/*
 * Registration hook: passes the kernel, its descriptor and the HC2C_VIA_DFT
 * tag to X(khc2c_register) for the given planner. X() is a name-wrapping
 * macro defined elsewhere.
 */
Chris@82 246 void X(codelet_hc2cfdft_8) (planner *p) {
Chris@82 247 X(khc2c_register) (p, hc2cfdft_8, &desc, HC2C_VIA_DFT);
Chris@82 248 }
Chris@82 249 #else
Chris@82 250
Chris@82 251 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cfdft_8 -include rdft/scalar/hc2cf.h */
Chris@82 252
Chris@82 253 /*
Chris@82 254 * This function contains 82 FP additions, 44 FP multiplications,
Chris@82 255 * (or, 68 additions, 30 multiplications, 14 fused multiply/add),
Chris@82 256 * 39 stack variables, 2 constants, and 32 memory accesses
Chris@82 257 */
Chris@82 258 #include "rdft/scalar/hc2cf.h"
Chris@82 259
/*
 * hc2cfdft_8 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * In-place kernel. For every m in [mb, me) it loads four values from each
 * of Rp, Ip, Rm and Im (indices 0 and WS(rs, 1..3)), combines them with the
 * 14 twiddle values W[0..13], and stores 16 results back into those same 16
 * locations. The scaling is applied either directly via KP500000000 or via
 * the pre-multiplied constant KP353553390.
 *
 * Parameters:
 *   Rp, Ip  - arrays read and written; advanced by +ms after each iteration.
 *   Rm, Im  - arrays read and written; advanced by -ms after each iteration.
 *   W       - twiddle table; iteration m reads the 14 values starting at
 *             W + (m - 1) * 14 (relative to the pointer passed in).
 *   rs      - stride handed to WS() to form the element indices.
 *   mb, me  - half-open iteration range [mb, me).
 *   ms      - pointer step applied between iterations.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this
 * file; presumably a*b + c and c - a*b respectively -- confirm in the
 * codelet headers. MAKE_VOLATILE_STRIDE, DK, E, R, INT and stride are
 * likewise declared elsewhere.
 */
Chris@82 260 static void hc2cfdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 261 {
Chris@82 262 DK(KP353553390, +0.353553390593273762200422181052424519642417969); /* numerically 0.5 / sqrt(2) */
Chris@82 263 DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 scale */
Chris@82 264 {
Chris@82 265 INT m;
Chris@82 266 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) { /* Rp/Ip step forward, Rm/Im step backward, W advances 14 per iteration */
Chris@82 267 E Tv, TX, Ts, TY, TE, T1a, TJ, T19, T1l, T1m, T9, T10, Ti, T11, TP;
Chris@82 268 E T16, TU, T17, T1i, T1j;
Chris@82 269 { /* indices 0 and WS(rs, 2): uses twiddles W[0], W[1] and W[6]..W[9] */
Chris@82 270 E Tt, Tu, TD, Tz, TA, TB, Tn, TI, Tr, TG, Tk, To;
Chris@82 271 Tt = Ip[0];
Chris@82 272 Tu = Im[0];
Chris@82 273 TD = Tt + Tu;
Chris@82 274 Tz = Rm[0];
Chris@82 275 TA = Rp[0];
Chris@82 276 TB = Tz - TA;
Chris@82 277 {
Chris@82 278 E Tl, Tm, Tp, Tq;
Chris@82 279 Tl = Ip[WS(rs, 2)];
Chris@82 280 Tm = Im[WS(rs, 2)];
Chris@82 281 Tn = Tl - Tm;
Chris@82 282 TI = Tl + Tm;
Chris@82 283 Tp = Rp[WS(rs, 2)];
Chris@82 284 Tq = Rm[WS(rs, 2)];
Chris@82 285 Tr = Tp + Tq;
Chris@82 286 TG = Tp - Tq;
Chris@82 287 }
Chris@82 288 Tv = Tt - Tu;
Chris@82 289 TX = TA + Tz;
Chris@82 290 Tk = W[6];
Chris@82 291 To = W[7];
Chris@82 292 Ts = FNMS(To, Tr, Tk * Tn);
Chris@82 293 TY = FMA(Tk, Tr, To * Tn);
Chris@82 294 {
Chris@82 295 E Ty, TC, TF, TH;
Chris@82 296 Ty = W[0];
Chris@82 297 TC = W[1];
Chris@82 298 TE = FNMS(TC, TD, Ty * TB);
Chris@82 299 T1a = FMA(TC, TB, Ty * TD);
Chris@82 300 TF = W[8];
Chris@82 301 TH = W[9];
Chris@82 302 TJ = FMA(TF, TG, TH * TI);
Chris@82 303 T19 = FNMS(TH, TG, TF * TI);
Chris@82 304 }
Chris@82 305 T1l = TJ + TE;
Chris@82 306 T1m = T1a - T19;
Chris@82 307 }
Chris@82 308 { /* indices WS(rs, 1) and WS(rs, 3): uses twiddles W[2]..W[5] and W[10]..W[13] */
Chris@82 309 E T4, TO, T8, TM, Td, TT, Th, TR;
Chris@82 310 {
Chris@82 311 E T2, T3, T6, T7;
Chris@82 312 T2 = Ip[WS(rs, 1)];
Chris@82 313 T3 = Im[WS(rs, 1)];
Chris@82 314 T4 = T2 - T3;
Chris@82 315 TO = T2 + T3;
Chris@82 316 T6 = Rp[WS(rs, 1)];
Chris@82 317 T7 = Rm[WS(rs, 1)];
Chris@82 318 T8 = T6 + T7;
Chris@82 319 TM = T6 - T7;
Chris@82 320 }
Chris@82 321 {
Chris@82 322 E Tb, Tc, Tf, Tg;
Chris@82 323 Tb = Ip[WS(rs, 3)];
Chris@82 324 Tc = Im[WS(rs, 3)];
Chris@82 325 Td = Tb - Tc;
Chris@82 326 TT = Tb + Tc;
Chris@82 327 Tf = Rp[WS(rs, 3)];
Chris@82 328 Tg = Rm[WS(rs, 3)];
Chris@82 329 Th = Tf + Tg;
Chris@82 330 TR = Tf - Tg;
Chris@82 331 }
Chris@82 332 {
Chris@82 333 E T1, T5, Ta, Te;
Chris@82 334 T1 = W[2];
Chris@82 335 T5 = W[3];
Chris@82 336 T9 = FNMS(T5, T8, T1 * T4);
Chris@82 337 T10 = FMA(T1, T8, T5 * T4);
Chris@82 338 Ta = W[10];
Chris@82 339 Te = W[11];
Chris@82 340 Ti = FNMS(Te, Th, Ta * Td);
Chris@82 341 T11 = FMA(Ta, Th, Te * Td);
Chris@82 342 {
Chris@82 343 E TL, TN, TQ, TS;
Chris@82 344 TL = W[4];
Chris@82 345 TN = W[5];
Chris@82 346 TP = FMA(TL, TM, TN * TO);
Chris@82 347 T16 = FNMS(TN, TM, TL * TO);
Chris@82 348 TQ = W[12];
Chris@82 349 TS = W[13];
Chris@82 350 TU = FMA(TQ, TR, TS * TT);
Chris@82 351 T17 = FNMS(TS, TR, TQ * TT);
Chris@82 352 }
Chris@82 353 T1i = T17 - T16;
Chris@82 354 T1j = TP - TU;
Chris@82 355 }
Chris@82 356 }
Chris@82 357 { /* eight outputs assembled from terms already scaled by KP500000000 or KP353553390 */
Chris@82 358 E T1h, T1t, T1w, T1y, T1o, T1s, T1r, T1x;
Chris@82 359 {
Chris@82 360 E T1f, T1g, T1u, T1v;
Chris@82 361 T1f = Tv - Ts;
Chris@82 362 T1g = T10 - T11;
Chris@82 363 T1h = KP500000000 * (T1f - T1g);
Chris@82 364 T1t = KP500000000 * (T1g + T1f);
Chris@82 365 T1u = T1i - T1j;
Chris@82 366 T1v = T1l + T1m;
Chris@82 367 T1w = KP353553390 * (T1u - T1v);
Chris@82 368 T1y = KP353553390 * (T1u + T1v);
Chris@82 369 }
Chris@82 370 {
Chris@82 371 E T1k, T1n, T1p, T1q;
Chris@82 372 T1k = T1i + T1j;
Chris@82 373 T1n = T1l - T1m;
Chris@82 374 T1o = KP353553390 * (T1k + T1n);
Chris@82 375 T1s = KP353553390 * (T1n - T1k);
Chris@82 376 T1p = TX - TY;
Chris@82 377 T1q = T9 - Ti;
Chris@82 378 T1r = KP500000000 * (T1p - T1q);
Chris@82 379 T1x = KP500000000 * (T1p + T1q);
Chris@82 380 }
Chris@82 381 Ip[WS(rs, 1)] = T1h + T1o;
Chris@82 382 Rp[WS(rs, 1)] = T1x + T1y;
Chris@82 383 Im[WS(rs, 2)] = T1o - T1h;
Chris@82 384 Rm[WS(rs, 2)] = T1x - T1y;
Chris@82 385 Rm[0] = T1r - T1s;
Chris@82 386 Im[0] = T1w - T1t;
Chris@82 387 Rp[WS(rs, 3)] = T1r + T1s;
Chris@82 388 Ip[WS(rs, 3)] = T1t + T1w;
Chris@82 389 }
Chris@82 390 { /* remaining eight outputs: sums/differences scaled by KP500000000 only */
Chris@82 391 E Tx, T15, T1c, T1e, TW, T14, T13, T1d;
Chris@82 392 {
Chris@82 393 E Tj, Tw, T18, T1b;
Chris@82 394 Tj = T9 + Ti;
Chris@82 395 Tw = Ts + Tv;
Chris@82 396 Tx = Tj + Tw;
Chris@82 397 T15 = Tw - Tj;
Chris@82 398 T18 = T16 + T17;
Chris@82 399 T1b = T19 + T1a;
Chris@82 400 T1c = T18 - T1b;
Chris@82 401 T1e = T18 + T1b;
Chris@82 402 }
Chris@82 403 {
Chris@82 404 E TK, TV, TZ, T12;
Chris@82 405 TK = TE - TJ;
Chris@82 406 TV = TP + TU;
Chris@82 407 TW = TK - TV;
Chris@82 408 T14 = TV + TK;
Chris@82 409 TZ = TX + TY;
Chris@82 410 T12 = T10 + T11;
Chris@82 411 T13 = TZ - T12;
Chris@82 412 T1d = TZ + T12;
Chris@82 413 }
Chris@82 414 Ip[0] = KP500000000 * (Tx + TW);
Chris@82 415 Rp[0] = KP500000000 * (T1d + T1e);
Chris@82 416 Im[WS(rs, 3)] = KP500000000 * (TW - Tx);
Chris@82 417 Rm[WS(rs, 3)] = KP500000000 * (T1d - T1e);
Chris@82 418 Rm[WS(rs, 1)] = KP500000000 * (T13 - T14);
Chris@82 419 Im[WS(rs, 1)] = KP500000000 * (T1c - T15);
Chris@82 420 Rp[WS(rs, 2)] = KP500000000 * (T13 + T14);
Chris@82 421 Ip[WS(rs, 2)] = KP500000000 * (T15 + T1c);
Chris@82 422 }
Chris@82 423 }
Chris@82 424 }
Chris@82 425 }
Chris@82 426
/*
 * Twiddle-instruction table referenced by the codelet descriptor below.
 * NOTE(review): TW_FULL / TW_NEXT and the tw_instr field layout are declared
 * elsewhere; presumably {TW_FULL, 1, 8} requests the full twiddle set for
 * size 8 (consistent with the kernel consuming 14 reals of W per iteration)
 * and TW_NEXT terminates the list -- confirm against the tw_instr definition.
 */
Chris@82 427 static const tw_instr twinstr[] = {
Chris@82 428 {TW_FULL, 1, 8},
Chris@82 429 {TW_NEXT, 1, 0}
Chris@82 430 };
Chris@82 431
/*
 * Descriptor handed to the registration call for the non-FMA variant: size
 * 8, the codelet name, its twiddle table, and &GENUS. The first three
 * entries of {68, 30, 14, 0} match the additions / multiplications / fused
 * multiply-add counts quoted in this variant's generated operation-count
 * comment. NOTE(review): the meaning of the trailing 0 and the exact
 * hc2c_desc field names are not visible in this file.
 */
Chris@82 432 static const hc2c_desc desc = { 8, "hc2cfdft_8", twinstr, &GENUS, {68, 30, 14, 0} };
Chris@82 433
/*
 * Registration hook: passes the kernel, its descriptor and the HC2C_VIA_DFT
 * tag to X(khc2c_register) for the given planner. X() is a name-wrapping
 * macro defined elsewhere.
 */
Chris@82 434 void X(codelet_hc2cfdft_8) (planner *p) {
Chris@82 435 X(khc2c_register) (p, hc2cfdft_8, &desc, HC2C_VIA_DFT);
Chris@82 436 }
Chris@82 437 #endif