annotate src/fftw-3.3.8/rdft/scalar/r2cb/hc2cbdft2_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:08:00 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft2_8 -include rdft/scalar/hc2cb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 82 FP additions, 36 FP multiplications,
Chris@82 32 * (or, 60 additions, 14 multiplications, 22 fused multiply/add),
Chris@82 33 * 41 stack variables, 1 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cb.h"
Chris@82 36
/*
 * hc2cbdft2_8 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Machine-generated size-8 kernel (see the genfft command line recorded in
 * the comment above this function).  Each loop iteration loads 16 values
 * (four each from Rp, Ip, Rm and Im at offsets WS(rs, 0..3)), combines them
 * with 14 multipliers taken from W, and stores 16 results back to the same
 * locations.  All loads of an iteration happen before its first store.
 *
 * Parameters:
 *   Rp, Ip  arrays read and overwritten in place; advanced by +ms per
 *           iteration.
 *   Rm, Im  arrays read and overwritten in place; moved by -ms per
 *           iteration.
 *   W       multiplier table; the loop starts at W + (mb - 1) * 14 and
 *           consumes W[0..13] per iteration, then advances by 14.
 *   rs      stride handed to WS() to form element offsets.
 *   mb, me  half-open iteration range [mb, me).
 *   ms      per-iteration pointer step for Rp/Ip (+ms) and Rm/Im (-ms).
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this file.  FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm against the
 * project headers.
 */
Chris@82 37 static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Only constant used by the kernel; the literal equals sqrt(2)/2. */
Chris@82 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 40 {
Chris@82 41 INT m;
/* One pass per m in [mb, me); pointers and W are stepped in the loop header. */
Chris@82 42 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Intermediates that survive past the load stage and feed the output stages. */
Chris@82 43 E Tl, T1p, T1g, TM, T1k, TE, TP, T1f, T7, Te, TU, TH, T1l, Tw, T1q;
Chris@82 44 E T1c, T1y;
/* Load stage: read the 16 inputs in pairs and form each pair's sum and difference. */
Chris@82 45 {
Chris@82 46 E T3, TA, Tk, TN, T6, Th, TD, TO, Ta, Tm, Tp, TK, Td, Tr, Tu;
Chris@82 47 E TL, TF, TG;
Chris@82 48 {
Chris@82 49 E T1, T2, Ti, Tj;
Chris@82 50 T1 = Rp[0];
Chris@82 51 T2 = Rm[WS(rs, 3)];
Chris@82 52 T3 = T1 + T2;
Chris@82 53 TA = T1 - T2;
Chris@82 54 Ti = Ip[0];
Chris@82 55 Tj = Im[WS(rs, 3)];
Chris@82 56 Tk = Ti + Tj;
Chris@82 57 TN = Ti - Tj;
Chris@82 58 }
Chris@82 59 {
Chris@82 60 E T4, T5, TB, TC;
Chris@82 61 T4 = Rp[WS(rs, 2)];
Chris@82 62 T5 = Rm[WS(rs, 1)];
Chris@82 63 T6 = T4 + T5;
Chris@82 64 Th = T4 - T5;
Chris@82 65 TB = Ip[WS(rs, 2)];
Chris@82 66 TC = Im[WS(rs, 1)];
Chris@82 67 TD = TB + TC;
Chris@82 68 TO = TB - TC;
Chris@82 69 }
Chris@82 70 {
Chris@82 71 E T8, T9, Tn, To;
Chris@82 72 T8 = Rp[WS(rs, 1)];
Chris@82 73 T9 = Rm[WS(rs, 2)];
Chris@82 74 Ta = T8 + T9;
Chris@82 75 Tm = T8 - T9;
Chris@82 76 Tn = Ip[WS(rs, 1)];
Chris@82 77 To = Im[WS(rs, 2)];
Chris@82 78 Tp = Tn + To;
Chris@82 79 TK = Tn - To;
Chris@82 80 }
Chris@82 81 {
Chris@82 82 E Tb, Tc, Ts, Tt;
Chris@82 83 Tb = Rm[0];
Chris@82 84 Tc = Rp[WS(rs, 3)];
Chris@82 85 Td = Tb + Tc;
Chris@82 86 Tr = Tb - Tc;
Chris@82 87 Ts = Im[0];
Chris@82 88 Tt = Ip[WS(rs, 3)];
Chris@82 89 Tu = Ts + Tt;
/* Note the operand order: this difference is Tt - Ts, not Ts - Tt. */
Chris@82 90 TL = Tt - Ts;
Chris@82 91 }
/* Second level: combine the pairwise sums/differences into the shared intermediates. */
Chris@82 92 Tl = Th + Tk;
Chris@82 93 T1p = TA + TD;
Chris@82 94 T1g = TN - TO;
Chris@82 95 TM = TK + TL;
Chris@82 96 T1k = Tk - Th;
Chris@82 97 TE = TA - TD;
Chris@82 98 TP = TN + TO;
Chris@82 99 T1f = Ta - Td;
Chris@82 100 T7 = T3 + T6;
Chris@82 101 Te = Ta + Td;
Chris@82 102 TU = T7 - Te;
Chris@82 103 TF = Tm - Tp;
Chris@82 104 TG = Tr - Tu;
Chris@82 105 TH = TF + TG;
Chris@82 106 T1l = TF - TG;
Chris@82 107 {
Chris@82 108 E Tq, Tv, T1a, T1b;
Chris@82 109 Tq = Tm + Tp;
Chris@82 110 Tv = Tr + Tu;
Chris@82 111 Tw = Tq - Tv;
Chris@82 112 T1q = Tq + Tv;
Chris@82 113 T1a = T3 - T6;
Chris@82 114 T1b = TL - TK;
Chris@82 115 T1c = T1a + T1b;
Chris@82 116 T1y = T1a - T1b;
Chris@82 117 }
Chris@82 118 }
/*
 * Output offset 0: scale Tw and TH by KP707106781 and add Tl / TE, apply
 * the multiplier pair W[0], W[1], then store to Rp[0], Ip[0], Rm[0], Im[0].
 */
Chris@82 119 {
Chris@82 120 E Tf, TQ, Tx, TI, Ty, TR, Tg, TJ, TS, Tz;
Chris@82 121 Tf = T7 + Te;
Chris@82 122 TQ = TM + TP;
Chris@82 123 Tx = FMA(KP707106781, Tw, Tl);
Chris@82 124 TI = FMA(KP707106781, TH, TE);
Chris@82 125 Tg = W[0];
Chris@82 126 Ty = Tg * Tx;
Chris@82 127 TR = Tg * TI;
Chris@82 128 Tz = W[1];
Chris@82 129 TJ = FMA(Tz, TI, Ty);
Chris@82 130 TS = FNMS(Tz, Tx, TR);
Chris@82 131 Rp[0] = Tf - TJ;
Chris@82 132 Ip[0] = TQ + TS;
Chris@82 133 Rm[0] = Tf + TJ;
Chris@82 134 Im[0] = TS - TQ;
Chris@82 135 }
/*
 * Output offset WS(rs, 3): multiplier pairs W[10], W[11] (applied to
 * T1y / T1B) and W[12], W[13] (applied to T1E / T1H).
 */
Chris@82 136 {
Chris@82 137 E T1B, T1A, T1J, T1x, T1z, T1E, T1H, T1F, T1L, T1D;
Chris@82 138 T1B = T1g - T1f;
Chris@82 139 T1A = W[11];
Chris@82 140 T1J = T1A * T1y;
Chris@82 141 T1x = W[10];
Chris@82 142 T1z = T1x * T1y;
Chris@82 143 T1E = FNMS(KP707106781, T1l, T1k);
Chris@82 144 T1H = FMA(KP707106781, T1q, T1p);
Chris@82 145 T1D = W[12];
Chris@82 146 T1F = T1D * T1E;
Chris@82 147 T1L = T1D * T1H;
Chris@82 148 {
Chris@82 149 E T1C, T1K, T1I, T1M, T1G;
Chris@82 150 T1C = FNMS(T1A, T1B, T1z);
Chris@82 151 T1K = FMA(T1x, T1B, T1J);
Chris@82 152 T1G = W[13];
Chris@82 153 T1I = FMA(T1G, T1H, T1F);
Chris@82 154 T1M = FNMS(T1G, T1E, T1L);
Chris@82 155 Rp[WS(rs, 3)] = T1C - T1I;
Chris@82 156 Ip[WS(rs, 3)] = T1K + T1M;
Chris@82 157 Rm[WS(rs, 3)] = T1C + T1I;
Chris@82 158 Im[WS(rs, 3)] = T1M - T1K;
Chris@82 159 }
Chris@82 160 }
/*
 * Output offset WS(rs, 2): multiplier pairs W[6], W[7] (applied to
 * TU / TX) and W[8], W[9] (applied to T10 / T13).
 */
Chris@82 161 {
Chris@82 162 E TX, TW, T15, TT, TV, T10, T13, T11, T17, TZ;
Chris@82 163 TX = TP - TM;
Chris@82 164 TW = W[7];
Chris@82 165 T15 = TW * TU;
Chris@82 166 TT = W[6];
Chris@82 167 TV = TT * TU;
Chris@82 168 T10 = FNMS(KP707106781, Tw, Tl);
Chris@82 169 T13 = FNMS(KP707106781, TH, TE);
Chris@82 170 TZ = W[8];
Chris@82 171 T11 = TZ * T10;
Chris@82 172 T17 = TZ * T13;
Chris@82 173 {
Chris@82 174 E TY, T16, T14, T18, T12;
Chris@82 175 TY = FNMS(TW, TX, TV);
Chris@82 176 T16 = FMA(TT, TX, T15);
Chris@82 177 T12 = W[9];
Chris@82 178 T14 = FMA(T12, T13, T11);
Chris@82 179 T18 = FNMS(T12, T10, T17);
Chris@82 180 Rp[WS(rs, 2)] = TY - T14;
Chris@82 181 Ip[WS(rs, 2)] = T16 + T18;
Chris@82 182 Rm[WS(rs, 2)] = TY + T14;
Chris@82 183 Im[WS(rs, 2)] = T18 - T16;
Chris@82 184 }
Chris@82 185 }
/*
 * Output offset WS(rs, 1): multiplier pairs W[2], W[3] (applied to
 * T1c / T1h) and W[4], W[5] (applied to T1m / T1r).
 */
Chris@82 186 {
Chris@82 187 E T1h, T1e, T1t, T19, T1d, T1m, T1r, T1n, T1v, T1j;
Chris@82 188 T1h = T1f + T1g;
Chris@82 189 T1e = W[3];
Chris@82 190 T1t = T1e * T1c;
Chris@82 191 T19 = W[2];
Chris@82 192 T1d = T19 * T1c;
Chris@82 193 T1m = FMA(KP707106781, T1l, T1k);
Chris@82 194 T1r = FNMS(KP707106781, T1q, T1p);
Chris@82 195 T1j = W[4];
Chris@82 196 T1n = T1j * T1m;
Chris@82 197 T1v = T1j * T1r;
Chris@82 198 {
Chris@82 199 E T1i, T1u, T1s, T1w, T1o;
Chris@82 200 T1i = FNMS(T1e, T1h, T1d);
Chris@82 201 T1u = FMA(T19, T1h, T1t);
Chris@82 202 T1o = W[5];
Chris@82 203 T1s = FMA(T1o, T1r, T1n);
Chris@82 204 T1w = FNMS(T1o, T1m, T1v);
Chris@82 205 Rp[WS(rs, 1)] = T1i - T1s;
Chris@82 206 Ip[WS(rs, 1)] = T1u + T1w;
Chris@82 207 Rm[WS(rs, 1)] = T1i + T1s;
Chris@82 208 Im[WS(rs, 1)] = T1w - T1u;
Chris@82 209 }
Chris@82 210 }
Chris@82 211 }
Chris@82 212 }
Chris@82 213 }
Chris@82 214
/*
 * Two-entry tw_instr table referenced by the codelet descriptor in this
 * preprocessor branch.
 * NOTE(review): TW_FULL / TW_NEXT and the tw_instr field layout are defined
 * outside this file; the 8 presumably matches the codelet size, which would
 * be consistent with the kernel consuming 14 = (8 - 1) * 2 values of W per
 * iteration -- confirm against the project headers.
 */
Chris@82 215 static const tw_instr twinstr[] = {
Chris@82 216 {TW_FULL, 1, 8},
Chris@82 217 {TW_NEXT, 1, 0}
Chris@82 218 };
Chris@82 219
/*
 * Descriptor for the FMA-variant kernel: the value 8, the name string, the
 * twinstr table, &GENUS, and the tuple {60, 14, 22, 0}.  The first three
 * tuple values equal the additions / multiplications / fused multiply-add
 * counts quoted in the generated header comment of this branch.
 * NOTE(review): hc2c_desc field meanings and GENUS come from headers outside
 * this file -- confirm there.
 */
Chris@82 220 static const hc2c_desc desc = { 8, "hc2cbdft2_8", twinstr, &GENUS, {60, 14, 22, 0} };
Chris@82 221
/*
 * Registration entry point for this codelet: passes the planner p, the
 * hc2cbdft2_8 kernel, its descriptor and HC2C_VIA_DFT to X(khc2c_register).
 * X(...) is a macro defined outside this file.
 */
Chris@82 222 void X(codelet_hc2cbdft2_8) (planner *p) {
Chris@82 223 X(khc2c_register) (p, hc2cbdft2_8, &desc, HC2C_VIA_DFT);
Chris@82 224 }
Chris@82 225 #else
Chris@82 226
Chris@82 227 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft2_8 -include rdft/scalar/hc2cb.h */
Chris@82 228
Chris@82 229 /*
Chris@82 230 * This function contains 82 FP additions, 32 FP multiplications,
Chris@82 231 * (or, 68 additions, 18 multiplications, 14 fused multiply/add),
Chris@82 232 * 30 stack variables, 1 constants, and 32 memory accesses
Chris@82 233 */
Chris@82 234 #include "rdft/scalar/hc2cb.h"
Chris@82 235
/*
 * hc2cbdft2_8 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Machine-generated size-8 kernel (see the genfft command line recorded in
 * the comment above this function).  Each loop iteration loads 16 values
 * (four each from Rp, Ip, Rm and Im at offsets WS(rs, 0..3)), combines them
 * with 14 multipliers taken from W, and stores 16 results back to the same
 * locations.  All loads of an iteration happen before its first store.
 *
 * Parameters:
 *   Rp, Ip  arrays read and overwritten in place; advanced by +ms per
 *           iteration.
 *   Rm, Im  arrays read and overwritten in place; moved by -ms per
 *           iteration.
 *   W       multiplier table; the loop starts at W + (mb - 1) * 14 and
 *           consumes W[0..13] per iteration, then advances by 14.
 *   rs      stride handed to WS() to form element offsets.
 *   mb, me  half-open iteration range [mb, me).
 *   ms      per-iteration pointer step for Rp/Ip (+ms) and Rm/Im (-ms).
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this file.  FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm against the
 * project headers.
 */
Chris@82 236 static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 237 {
/* Only constant used by the kernel; the literal equals sqrt(2)/2. */
Chris@82 238 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 239 {
Chris@82 240 INT m;
/* One pass per m in [mb, me); pointers and W are stepped in the loop header. */
Chris@82 241 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Intermediates that survive past the two load stages and feed the output stages. */
Chris@82 242 E T7, T1d, T1h, Tl, TG, T14, T19, TO, Te, TL, T18, T15, TB, T1e, Tw;
Chris@82 243 E T1i;
/* Load stage A: inputs at Rp/Ip offsets 0 and 2 paired with Rm/Im offsets 3 and 1. */
Chris@82 244 {
Chris@82 245 E T3, TC, Tk, TM, T6, Th, TF, TN;
Chris@82 246 {
Chris@82 247 E T1, T2, Ti, Tj;
Chris@82 248 T1 = Rp[0];
Chris@82 249 T2 = Rm[WS(rs, 3)];
Chris@82 250 T3 = T1 + T2;
Chris@82 251 TC = T1 - T2;
Chris@82 252 Ti = Ip[0];
Chris@82 253 Tj = Im[WS(rs, 3)];
Chris@82 254 Tk = Ti + Tj;
Chris@82 255 TM = Ti - Tj;
Chris@82 256 }
Chris@82 257 {
Chris@82 258 E T4, T5, TD, TE;
Chris@82 259 T4 = Rp[WS(rs, 2)];
Chris@82 260 T5 = Rm[WS(rs, 1)];
Chris@82 261 T6 = T4 + T5;
Chris@82 262 Th = T4 - T5;
Chris@82 263 TD = Ip[WS(rs, 2)];
Chris@82 264 TE = Im[WS(rs, 1)];
Chris@82 265 TF = TD + TE;
Chris@82 266 TN = TD - TE;
Chris@82 267 }
Chris@82 268 T7 = T3 + T6;
Chris@82 269 T1d = Tk - Th;
Chris@82 270 T1h = TC + TF;
Chris@82 271 Tl = Th + Tk;
Chris@82 272 TG = TC - TF;
Chris@82 273 T14 = T3 - T6;
Chris@82 274 T19 = TM - TN;
Chris@82 275 TO = TM + TN;
Chris@82 276 }
/* Load stage B: inputs at Rp/Ip offsets 1 and 3 paired with Rm/Im offsets 2 and 0. */
Chris@82 277 {
Chris@82 278 E Ta, Tm, Tp, TJ, Td, Tr, Tu, TK;
Chris@82 279 {
Chris@82 280 E T8, T9, Tn, To;
Chris@82 281 T8 = Rp[WS(rs, 1)];
Chris@82 282 T9 = Rm[WS(rs, 2)];
Chris@82 283 Ta = T8 + T9;
Chris@82 284 Tm = T8 - T9;
Chris@82 285 Tn = Ip[WS(rs, 1)];
Chris@82 286 To = Im[WS(rs, 2)];
Chris@82 287 Tp = Tn + To;
Chris@82 288 TJ = Tn - To;
Chris@82 289 }
Chris@82 290 {
Chris@82 291 E Tb, Tc, Ts, Tt;
Chris@82 292 Tb = Rm[0];
Chris@82 293 Tc = Rp[WS(rs, 3)];
Chris@82 294 Td = Tb + Tc;
Chris@82 295 Tr = Tb - Tc;
Chris@82 296 Ts = Im[0];
Chris@82 297 Tt = Ip[WS(rs, 3)];
Chris@82 298 Tu = Ts + Tt;
/* Note the operand order: this difference is Tt - Ts, not Ts - Tt. */
Chris@82 299 TK = Tt - Ts;
Chris@82 300 }
Chris@82 301 Te = Ta + Td;
Chris@82 302 TL = TJ + TK;
Chris@82 303 T18 = Ta - Td;
Chris@82 304 T15 = TK - TJ;
/* The four products with KP707106781 are formed here, once, and reused by the output stages. */
Chris@82 305 {
Chris@82 306 E Tz, TA, Tq, Tv;
Chris@82 307 Tz = Tm - Tp;
Chris@82 308 TA = Tr - Tu;
Chris@82 309 TB = KP707106781 * (Tz + TA);
Chris@82 310 T1e = KP707106781 * (Tz - TA);
Chris@82 311 Tq = Tm + Tp;
Chris@82 312 Tv = Tr + Tu;
Chris@82 313 Tw = KP707106781 * (Tq - Tv);
Chris@82 314 T1i = KP707106781 * (Tq + Tv);
Chris@82 315 }
Chris@82 316 }
/* Output offset 0: multiplier pair W[0], W[1] applied to Tx / TH, then four stores. */
Chris@82 317 {
Chris@82 318 E Tf, TP, TI, TQ;
Chris@82 319 Tf = T7 + Te;
Chris@82 320 TP = TL + TO;
Chris@82 321 {
Chris@82 322 E Tx, TH, Tg, Ty;
Chris@82 323 Tx = Tl + Tw;
Chris@82 324 TH = TB + TG;
Chris@82 325 Tg = W[0];
Chris@82 326 Ty = W[1];
Chris@82 327 TI = FMA(Tg, Tx, Ty * TH);
Chris@82 328 TQ = FNMS(Ty, Tx, Tg * TH);
Chris@82 329 }
Chris@82 330 Rp[0] = Tf - TI;
Chris@82 331 Ip[0] = TP + TQ;
Chris@82 332 Rm[0] = Tf + TI;
Chris@82 333 Im[0] = TQ - TP;
Chris@82 334 }
/*
 * Output offset WS(rs, 3): multiplier pairs W[10], W[11] (applied to
 * T1o / T1q) and W[12], W[13] (applied to T1t / T1v).
 */
Chris@82 335 {
Chris@82 336 E T1r, T1x, T1w, T1y;
Chris@82 337 {
Chris@82 338 E T1o, T1q, T1n, T1p;
Chris@82 339 T1o = T14 - T15;
Chris@82 340 T1q = T19 - T18;
Chris@82 341 T1n = W[10];
Chris@82 342 T1p = W[11];
Chris@82 343 T1r = FNMS(T1p, T1q, T1n * T1o);
Chris@82 344 T1x = FMA(T1p, T1o, T1n * T1q);
Chris@82 345 }
Chris@82 346 {
Chris@82 347 E T1t, T1v, T1s, T1u;
Chris@82 348 T1t = T1d - T1e;
Chris@82 349 T1v = T1i + T1h;
Chris@82 350 T1s = W[12];
Chris@82 351 T1u = W[13];
Chris@82 352 T1w = FMA(T1s, T1t, T1u * T1v);
Chris@82 353 T1y = FNMS(T1u, T1t, T1s * T1v);
Chris@82 354 }
Chris@82 355 Rp[WS(rs, 3)] = T1r - T1w;
Chris@82 356 Ip[WS(rs, 3)] = T1x + T1y;
Chris@82 357 Rm[WS(rs, 3)] = T1r + T1w;
Chris@82 358 Im[WS(rs, 3)] = T1y - T1x;
Chris@82 359 }
/*
 * Output offset WS(rs, 2): multiplier pairs W[6], W[7] (applied to
 * TS / TU) and W[8], W[9] (applied to TX / TZ).
 */
Chris@82 360 {
Chris@82 361 E TV, T11, T10, T12;
Chris@82 362 {
Chris@82 363 E TS, TU, TR, TT;
Chris@82 364 TS = T7 - Te;
Chris@82 365 TU = TO - TL;
Chris@82 366 TR = W[6];
Chris@82 367 TT = W[7];
Chris@82 368 TV = FNMS(TT, TU, TR * TS);
Chris@82 369 T11 = FMA(TT, TS, TR * TU);
Chris@82 370 }
Chris@82 371 {
Chris@82 372 E TX, TZ, TW, TY;
Chris@82 373 TX = Tl - Tw;
Chris@82 374 TZ = TG - TB;
Chris@82 375 TW = W[8];
Chris@82 376 TY = W[9];
Chris@82 377 T10 = FMA(TW, TX, TY * TZ);
Chris@82 378 T12 = FNMS(TY, TX, TW * TZ);
Chris@82 379 }
Chris@82 380 Rp[WS(rs, 2)] = TV - T10;
Chris@82 381 Ip[WS(rs, 2)] = T11 + T12;
Chris@82 382 Rm[WS(rs, 2)] = TV + T10;
Chris@82 383 Im[WS(rs, 2)] = T12 - T11;
Chris@82 384 }
/*
 * Output offset WS(rs, 1): multiplier pairs W[2], W[3] (applied to
 * T16 / T1a) and W[4], W[5] (applied to T1f / T1j).
 */
Chris@82 385 {
Chris@82 386 E T1b, T1l, T1k, T1m;
Chris@82 387 {
Chris@82 388 E T16, T1a, T13, T17;
Chris@82 389 T16 = T14 + T15;
Chris@82 390 T1a = T18 + T19;
Chris@82 391 T13 = W[2];
Chris@82 392 T17 = W[3];
Chris@82 393 T1b = FNMS(T17, T1a, T13 * T16);
Chris@82 394 T1l = FMA(T17, T16, T13 * T1a);
Chris@82 395 }
Chris@82 396 {
Chris@82 397 E T1f, T1j, T1c, T1g;
Chris@82 398 T1f = T1d + T1e;
Chris@82 399 T1j = T1h - T1i;
Chris@82 400 T1c = W[4];
Chris@82 401 T1g = W[5];
Chris@82 402 T1k = FMA(T1c, T1f, T1g * T1j);
Chris@82 403 T1m = FNMS(T1g, T1f, T1c * T1j);
Chris@82 404 }
Chris@82 405 Rp[WS(rs, 1)] = T1b - T1k;
Chris@82 406 Ip[WS(rs, 1)] = T1l + T1m;
Chris@82 407 Rm[WS(rs, 1)] = T1b + T1k;
Chris@82 408 Im[WS(rs, 1)] = T1m - T1l;
Chris@82 409 }
Chris@82 410 }
Chris@82 411 }
Chris@82 412 }
Chris@82 413
/*
 * Two-entry tw_instr table referenced by the codelet descriptor in this
 * preprocessor branch.
 * NOTE(review): TW_FULL / TW_NEXT and the tw_instr field layout are defined
 * outside this file; the 8 presumably matches the codelet size, which would
 * be consistent with the kernel consuming 14 = (8 - 1) * 2 values of W per
 * iteration -- confirm against the project headers.
 */
Chris@82 414 static const tw_instr twinstr[] = {
Chris@82 415 {TW_FULL, 1, 8},
Chris@82 416 {TW_NEXT, 1, 0}
Chris@82 417 };
Chris@82 418
/*
 * Descriptor for the non-FMA-variant kernel: the value 8, the name string,
 * the twinstr table, &GENUS, and the tuple {68, 18, 14, 0}.  The first three
 * tuple values equal the additions / multiplications / fused multiply-add
 * counts quoted in the generated header comment of this branch.
 * NOTE(review): hc2c_desc field meanings and GENUS come from headers outside
 * this file -- confirm there.
 */
Chris@82 419 static const hc2c_desc desc = { 8, "hc2cbdft2_8", twinstr, &GENUS, {68, 18, 14, 0} };
Chris@82 420
/*
 * Registration entry point for this codelet: passes the planner p, the
 * hc2cbdft2_8 kernel, its descriptor and HC2C_VIA_DFT to X(khc2c_register).
 * X(...) is a macro defined outside this file.
 */
Chris@82 421 void X(codelet_hc2cbdft2_8) (planner *p) {
Chris@82 422 X(khc2c_register) (p, hc2cbdft2_8, &desc, HC2C_VIA_DFT);
Chris@82 423 }
Chris@82 424 #endif