annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cbdft2_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:52:05 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft2_8 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 82 FP additions, 36 FP multiplications,
Chris@42 32 * (or, 60 additions, 14 multiplications, 22 fused multiply/add),
Chris@42 33 * 55 stack variables, 1 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cbdft2_8, HAVE_FMA variant.
 *
 * Runs one pass for every m in [mb, me). Each pass loads 16 values (Rp, Ip,
 * Rm and Im at offsets WS(rs, 0) .. WS(rs, 3)), combines them with the 14
 * coefficients W[0] .. W[13], and stores 16 results back to the same 16
 * locations. All loads of a pass happen before its first store.
 *
 * Rp, Ip  arrays read and written; advanced by +ms after each pass.
 * Rm, Im  arrays read and written; advanced by -ms after each pass.
 * W       read-only coefficients; moved to W + (mb - 1) * 14 before the
 *         first pass and advanced by 14 after each pass.
 * rs      stride handed to WS() to form the four element offsets.
 * mb, me  first pass index and one past the last pass index.
 * ms      per-pass pointer step for Rp/Ip (+ms) and Rm/Im (-ms).
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are declared outside this file. FMA(a, b, c) is
 * presumably a * b + c and FNMS(a, b, c) presumably c - a * b -- confirm in
 * the FFTW headers.
 */
Chris@42 37 static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* The literal equals 1/sqrt(2) to the printed precision. */
Chris@42 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 40 {
Chris@42 41 INT m;
/* W is offset once in the init clause; every pointer is stepped in the increment clause. */
Chris@42 42 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
/* These temporaries outlive the nested scopes because the offset-1 outputs are stored last. */
Chris@42 43 E T1m, T1r, T1i, T1u, T1o, T1v, T1n, T1w, T1s;
Chris@42 44 {
Chris@42 45 E T1k, Tl, T1p, TE, TP, T1g, TM, T1b, T1f, T1a, TU, Tf, T1l, TH, Tw;
Chris@42 46 E T1q;
Chris@42 47 {
Chris@42 48 E TA, T3, TN, Tk, Th, T6, TO, TD, Tb, Tm, Ta, TK, Tp, Tc, Ts;
Chris@42 49 E Tt;
Chris@42 50 {
Chris@42 51 E T4, T5, TB, TC;
Chris@42 52 {
Chris@42 53 E T1, T2, Ti, Tj;
/* Load the 16 inputs; pairwise sums and differences are formed between the loads. */
Chris@42 54 T1 = Rp[0];
Chris@42 55 T2 = Rm[WS(rs, 3)];
Chris@42 56 Ti = Ip[0];
Chris@42 57 Tj = Im[WS(rs, 3)];
Chris@42 58 T4 = Rp[WS(rs, 2)];
Chris@42 59 TA = T1 - T2;
Chris@42 60 T3 = T1 + T2;
Chris@42 61 TN = Ti - Tj;
Chris@42 62 Tk = Ti + Tj;
Chris@42 63 T5 = Rm[WS(rs, 1)];
Chris@42 64 TB = Ip[WS(rs, 2)];
Chris@42 65 TC = Im[WS(rs, 1)];
Chris@42 66 }
Chris@42 67 {
Chris@42 68 E T8, T9, Tn, To;
Chris@42 69 T8 = Rp[WS(rs, 1)];
Chris@42 70 Th = T4 - T5;
Chris@42 71 T6 = T4 + T5;
Chris@42 72 TO = TB - TC;
Chris@42 73 TD = TB + TC;
Chris@42 74 T9 = Rm[WS(rs, 2)];
Chris@42 75 Tn = Ip[WS(rs, 1)];
Chris@42 76 To = Im[WS(rs, 2)];
Chris@42 77 Tb = Rm[0];
Chris@42 78 Tm = T8 - T9;
Chris@42 79 Ta = T8 + T9;
Chris@42 80 TK = Tn - To;
Chris@42 81 Tp = Tn + To;
Chris@42 82 Tc = Rp[WS(rs, 3)];
Chris@42 83 Ts = Im[0];
Chris@42 84 Tt = Ip[WS(rs, 3)];
Chris@42 85 }
Chris@42 86 }
Chris@42 87 {
Chris@42 88 E Tr, Td, Tu, TL, Te, T7;
/* Second layer of sums and differences; all inputs have been read by this point. */
Chris@42 89 T1k = Tk - Th;
Chris@42 90 Tl = Th + Tk;
Chris@42 91 Tr = Tb - Tc;
Chris@42 92 Td = Tb + Tc;
Chris@42 93 TL = Tt - Ts;
Chris@42 94 Tu = Ts + Tt;
Chris@42 95 T1p = TA + TD;
Chris@42 96 TE = TA - TD;
Chris@42 97 TP = TN + TO;
Chris@42 98 T1g = TN - TO;
Chris@42 99 TM = TK + TL;
Chris@42 100 T1b = TL - TK;
Chris@42 101 T1f = Ta - Td;
Chris@42 102 Te = Ta + Td;
Chris@42 103 T1a = T3 - T6;
Chris@42 104 T7 = T3 + T6;
Chris@42 105 {
Chris@42 106 E Tq, TF, TG, Tv;
Chris@42 107 Tq = Tm + Tp;
Chris@42 108 TF = Tm - Tp;
Chris@42 109 TG = Tr - Tu;
Chris@42 110 Tv = Tr + Tu;
Chris@42 111 TU = T7 - Te;
Chris@42 112 Tf = T7 + Te;
Chris@42 113 T1l = TF - TG;
Chris@42 114 TH = TF + TG;
Chris@42 115 Tw = Tq - Tv;
Chris@42 116 T1q = Tq + Tv;
Chris@42 117 }
Chris@42 118 }
Chris@42 119 }
Chris@42 120 {
Chris@42 121 E TX, T10, T1c, T13, T1h, T1E, T1H, T1C, T1K, T1G, T1L, T1F;
Chris@42 122 {
Chris@42 123 E TQ, Tx, T1y, TI, Tg, Tz;
Chris@42 124 TX = TP - TM;
Chris@42 125 TQ = TM + TP;
/* Tw, TH, T1l and T1q are each scaled by KP707106781 inside an FMA/FNMS. */
Chris@42 126 Tx = FMA(KP707106781, Tw, Tl);
Chris@42 127 T10 = FNMS(KP707106781, Tw, Tl);
Chris@42 128 T1c = T1a + T1b;
Chris@42 129 T1y = T1a - T1b;
Chris@42 130 T13 = FNMS(KP707106781, TH, TE);
Chris@42 131 TI = FMA(KP707106781, TH, TE);
Chris@42 132 Tg = W[0];
Chris@42 133 Tz = W[1];
Chris@42 134 {
Chris@42 135 E T1B, T1A, T1x, T1J, T1z, T1D;
Chris@42 136 {
Chris@42 137 E TR, Ty, TS, TJ;
Chris@42 138 T1B = T1g - T1f;
Chris@42 139 T1h = T1f + T1g;
Chris@42 140 T1A = W[11];
Chris@42 141 TR = Tg * TI;
Chris@42 142 Ty = Tg * Tx;
Chris@42 143 T1x = W[10];
Chris@42 144 T1J = T1A * T1y;
Chris@42 145 TS = FNMS(Tz, Tx, TR);
Chris@42 146 TJ = FMA(Tz, TI, Ty);
Chris@42 147 T1z = T1x * T1y;
Chris@42 148 T1m = FMA(KP707106781, T1l, T1k);
Chris@42 149 T1E = FNMS(KP707106781, T1l, T1k);
/* Outputs at offset 0; the only coefficients involved are W[0] and W[1]. */
Chris@42 150 Im[0] = TS - TQ;
Chris@42 151 Ip[0] = TQ + TS;
Chris@42 152 Rm[0] = Tf + TJ;
Chris@42 153 Rp[0] = Tf - TJ;
Chris@42 154 T1H = FMA(KP707106781, T1q, T1p);
Chris@42 155 T1r = FNMS(KP707106781, T1q, T1p);
Chris@42 156 T1D = W[12];
Chris@42 157 }
Chris@42 158 T1C = FNMS(T1A, T1B, T1z);
Chris@42 159 T1K = FMA(T1x, T1B, T1J);
Chris@42 160 T1G = W[13];
Chris@42 161 T1L = T1D * T1H;
Chris@42 162 T1F = T1D * T1E;
Chris@42 163 }
Chris@42 164 }
Chris@42 165 {
Chris@42 166 E TY, T16, T12, T17, T11;
Chris@42 167 {
Chris@42 168 E TW, TT, T15, TV, TZ, T1M, T1I;
Chris@42 169 TW = W[7];
Chris@42 170 T1M = FNMS(T1G, T1E, T1L);
Chris@42 171 T1I = FMA(T1G, T1H, T1F);
Chris@42 172 TT = W[6];
Chris@42 173 T15 = TW * TU;
/* Outputs at offset WS(rs, 3); coefficients W[10] .. W[13]. */
Chris@42 174 Im[WS(rs, 3)] = T1M - T1K;
Chris@42 175 Ip[WS(rs, 3)] = T1K + T1M;
Chris@42 176 Rm[WS(rs, 3)] = T1C + T1I;
Chris@42 177 Rp[WS(rs, 3)] = T1C - T1I;
Chris@42 178 TV = TT * TU;
Chris@42 179 TZ = W[8];
Chris@42 180 TY = FNMS(TW, TX, TV);
Chris@42 181 T16 = FMA(TT, TX, T15);
Chris@42 182 T12 = W[9];
Chris@42 183 T17 = TZ * T13;
Chris@42 184 T11 = TZ * T10;
Chris@42 185 }
Chris@42 186 {
Chris@42 187 E T1e, T19, T1t, T1d, T1j, T18, T14;
Chris@42 188 T1e = W[3];
Chris@42 189 T18 = FNMS(T12, T10, T17);
Chris@42 190 T14 = FMA(T12, T13, T11);
Chris@42 191 T19 = W[2];
Chris@42 192 T1t = T1e * T1c;
/* Outputs at offset WS(rs, 2); coefficients W[6] .. W[9]. */
Chris@42 193 Im[WS(rs, 2)] = T18 - T16;
Chris@42 194 Ip[WS(rs, 2)] = T16 + T18;
Chris@42 195 Rm[WS(rs, 2)] = TY + T14;
Chris@42 196 Rp[WS(rs, 2)] = TY - T14;
Chris@42 197 T1d = T19 * T1c;
Chris@42 198 T1j = W[4];
Chris@42 199 T1i = FNMS(T1e, T1h, T1d);
Chris@42 200 T1u = FMA(T19, T1h, T1t);
Chris@42 201 T1o = W[5];
Chris@42 202 T1v = T1j * T1r;
Chris@42 203 T1n = T1j * T1m;
Chris@42 204 }
Chris@42 205 }
Chris@42 206 }
Chris@42 207 }
/* Outputs at offset WS(rs, 1); coefficients W[2] .. W[5]. */
Chris@42 208 T1w = FNMS(T1o, T1m, T1v);
Chris@42 209 T1s = FMA(T1o, T1r, T1n);
Chris@42 210 Im[WS(rs, 1)] = T1w - T1u;
Chris@42 211 Ip[WS(rs, 1)] = T1u + T1w;
Chris@42 212 Rm[WS(rs, 1)] = T1i + T1s;
Chris@42 213 Rp[WS(rs, 1)] = T1i - T1s;
Chris@42 214 }
Chris@42 215 }
Chris@42 216 }
Chris@42 217
/*
 * Two-entry tw_instr table referenced by desc: a TW_FULL entry followed by
 * a TW_NEXT entry.
 * NOTE(review): the field meanings come from the tw_instr declaration, which
 * is outside this file; presumably this describes the layout of the W array
 * consumed by hc2cbdft2_8 -- confirm.
 */
Chris@42 218 static const tw_instr twinstr[] = {
Chris@42 219 {TW_FULL, 1, 8},
Chris@42 220 {TW_NEXT, 1, 0}
Chris@42 221 };
Chris@42 222
/*
 * Descriptor handed to the registration call: the value 8, the codelet name,
 * the twinstr table, &GENUS, and a four-number initializer whose first three
 * values match the operation counts quoted in the generator comment for this
 * variant (60 additions, 14 multiplications, 22 fused multiply/adds).
 */
Chris@42 223 static const hc2c_desc desc = { 8, "hc2cbdft2_8", twinstr, &GENUS, {60, 14, 22, 0} };
Chris@42 224
/*
 * Registration entry point (HAVE_FMA build): hands hc2cbdft2_8 and its
 * descriptor to planner p through X(khc2c_register), passing HC2C_VIA_DFT.
 */
Chris@42 225 void X(codelet_hc2cbdft2_8) (planner *p) {
Chris@42 226 X(khc2c_register) (p, hc2cbdft2_8, &desc, HC2C_VIA_DFT);
Chris@42 227 }
Chris@42 228 #else /* HAVE_FMA */
Chris@42 229
Chris@42 230 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cbdft2_8 -include hc2cb.h */
Chris@42 231
Chris@42 232 /*
Chris@42 233 * This function contains 82 FP additions, 32 FP multiplications,
Chris@42 234 * (or, 68 additions, 18 multiplications, 14 fused multiply/add),
Chris@42 235 * 30 stack variables, 1 constants, and 32 memory accesses
Chris@42 236 */
Chris@42 237 #include "hc2cb.h"
Chris@42 238
/*
 * hc2cbdft2_8, variant compiled when HAVE_FMA is not defined.
 *
 * Runs one pass for every m in [mb, me). Each pass loads 16 values (Rp, Ip,
 * Rm and Im at offsets WS(rs, 0) .. WS(rs, 3)), combines them with the 14
 * coefficients W[0] .. W[13], and stores 16 results back to the same 16
 * locations. All loads of a pass happen before its first store.
 *
 * Rp, Ip  arrays read and written; advanced by +ms after each pass.
 * Rm, Im  arrays read and written; advanced by -ms after each pass.
 * W       read-only coefficients; moved to W + (mb - 1) * 14 before the
 *         first pass and advanced by 14 after each pass.
 * rs      stride handed to WS() to form the four element offsets.
 * mb, me  first pass index and one past the last pass index.
 * ms      per-pass pointer step for Rp/Ip (+ms) and Rm/Im (-ms).
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are declared outside this file. FMA(a, b, c) is
 * presumably a * b + c and FNMS(a, b, c) presumably c - a * b -- confirm in
 * the FFTW headers.
 */
Chris@42 239 static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 240 {
/* The literal equals 1/sqrt(2) to the printed precision. */
Chris@42 241 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 242 {
Chris@42 243 INT m;
/* W is offset once in the init clause; every pointer is stepped in the increment clause. */
Chris@42 244 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 245 E T7, T1d, T1h, Tl, TG, T14, T19, TO, Te, TL, T18, T15, TB, T1e, Tw;
Chris@42 246 E T1i;
/* First half of the inputs: Rp/Ip at offsets 0 and 2, paired with Rm/Im at offsets 3 and 1. */
Chris@42 247 {
Chris@42 248 E T3, TC, Tk, TM, T6, Th, TF, TN;
Chris@42 249 {
Chris@42 250 E T1, T2, Ti, Tj;
Chris@42 251 T1 = Rp[0];
Chris@42 252 T2 = Rm[WS(rs, 3)];
Chris@42 253 T3 = T1 + T2;
Chris@42 254 TC = T1 - T2;
Chris@42 255 Ti = Ip[0];
Chris@42 256 Tj = Im[WS(rs, 3)];
Chris@42 257 Tk = Ti + Tj;
Chris@42 258 TM = Ti - Tj;
Chris@42 259 }
Chris@42 260 {
Chris@42 261 E T4, T5, TD, TE;
Chris@42 262 T4 = Rp[WS(rs, 2)];
Chris@42 263 T5 = Rm[WS(rs, 1)];
Chris@42 264 T6 = T4 + T5;
Chris@42 265 Th = T4 - T5;
Chris@42 266 TD = Ip[WS(rs, 2)];
Chris@42 267 TE = Im[WS(rs, 1)];
Chris@42 268 TF = TD + TE;
Chris@42 269 TN = TD - TE;
Chris@42 270 }
Chris@42 271 T7 = T3 + T6;
Chris@42 272 T1d = Tk - Th;
Chris@42 273 T1h = TC + TF;
Chris@42 274 Tl = Th + Tk;
Chris@42 275 TG = TC - TF;
Chris@42 276 T14 = T3 - T6;
Chris@42 277 T19 = TM - TN;
Chris@42 278 TO = TM + TN;
Chris@42 279 }
/* Second half of the inputs: Rp/Ip at offsets 1 and 3, paired with Rm/Im at offsets 2 and 0. */
Chris@42 280 {
Chris@42 281 E Ta, Tm, Tp, TJ, Td, Tr, Tu, TK;
Chris@42 282 {
Chris@42 283 E T8, T9, Tn, To;
Chris@42 284 T8 = Rp[WS(rs, 1)];
Chris@42 285 T9 = Rm[WS(rs, 2)];
Chris@42 286 Ta = T8 + T9;
Chris@42 287 Tm = T8 - T9;
Chris@42 288 Tn = Ip[WS(rs, 1)];
Chris@42 289 To = Im[WS(rs, 2)];
Chris@42 290 Tp = Tn + To;
Chris@42 291 TJ = Tn - To;
Chris@42 292 }
Chris@42 293 {
Chris@42 294 E Tb, Tc, Ts, Tt;
Chris@42 295 Tb = Rm[0];
Chris@42 296 Tc = Rp[WS(rs, 3)];
Chris@42 297 Td = Tb + Tc;
Chris@42 298 Tr = Tb - Tc;
Chris@42 299 Ts = Im[0];
Chris@42 300 Tt = Ip[WS(rs, 3)];
Chris@42 301 Tu = Ts + Tt;
Chris@42 302 TK = Tt - Ts;
Chris@42 303 }
Chris@42 304 Te = Ta + Td;
Chris@42 305 TL = TJ + TK;
Chris@42 306 T18 = Ta - Td;
Chris@42 307 T15 = TK - TJ;
Chris@42 308 {
Chris@42 309 E Tz, TA, Tq, Tv;
/* The four products below are the only uses of KP707106781 in this variant. */
Chris@42 310 Tz = Tm - Tp;
Chris@42 311 TA = Tr - Tu;
Chris@42 312 TB = KP707106781 * (Tz + TA);
Chris@42 313 T1e = KP707106781 * (Tz - TA);
Chris@42 314 Tq = Tm + Tp;
Chris@42 315 Tv = Tr + Tu;
Chris@42 316 Tw = KP707106781 * (Tq - Tv);
Chris@42 317 T1i = KP707106781 * (Tq + Tv);
Chris@42 318 }
Chris@42 319 }
/* Outputs at offset 0; the only coefficients involved are W[0] and W[1]. */
Chris@42 320 {
Chris@42 321 E Tf, TP, TI, TQ;
Chris@42 322 Tf = T7 + Te;
Chris@42 323 TP = TL + TO;
Chris@42 324 {
Chris@42 325 E Tx, TH, Tg, Ty;
Chris@42 326 Tx = Tl + Tw;
Chris@42 327 TH = TB + TG;
Chris@42 328 Tg = W[0];
Chris@42 329 Ty = W[1];
Chris@42 330 TI = FMA(Tg, Tx, Ty * TH);
Chris@42 331 TQ = FNMS(Ty, Tx, Tg * TH);
Chris@42 332 }
Chris@42 333 Rp[0] = Tf - TI;
Chris@42 334 Ip[0] = TP + TQ;
Chris@42 335 Rm[0] = Tf + TI;
Chris@42 336 Im[0] = TQ - TP;
Chris@42 337 }
/* Outputs at offset WS(rs, 3); coefficients W[10] .. W[13]. */
Chris@42 338 {
Chris@42 339 E T1r, T1x, T1w, T1y;
Chris@42 340 {
Chris@42 341 E T1o, T1q, T1n, T1p;
Chris@42 342 T1o = T14 - T15;
Chris@42 343 T1q = T19 - T18;
Chris@42 344 T1n = W[10];
Chris@42 345 T1p = W[11];
Chris@42 346 T1r = FNMS(T1p, T1q, T1n * T1o);
Chris@42 347 T1x = FMA(T1p, T1o, T1n * T1q);
Chris@42 348 }
Chris@42 349 {
Chris@42 350 E T1t, T1v, T1s, T1u;
Chris@42 351 T1t = T1d - T1e;
Chris@42 352 T1v = T1i + T1h;
Chris@42 353 T1s = W[12];
Chris@42 354 T1u = W[13];
Chris@42 355 T1w = FMA(T1s, T1t, T1u * T1v);
Chris@42 356 T1y = FNMS(T1u, T1t, T1s * T1v);
Chris@42 357 }
Chris@42 358 Rp[WS(rs, 3)] = T1r - T1w;
Chris@42 359 Ip[WS(rs, 3)] = T1x + T1y;
Chris@42 360 Rm[WS(rs, 3)] = T1r + T1w;
Chris@42 361 Im[WS(rs, 3)] = T1y - T1x;
Chris@42 362 }
/* Outputs at offset WS(rs, 2); coefficients W[6] .. W[9]. */
Chris@42 363 {
Chris@42 364 E TV, T11, T10, T12;
Chris@42 365 {
Chris@42 366 E TS, TU, TR, TT;
Chris@42 367 TS = T7 - Te;
Chris@42 368 TU = TO - TL;
Chris@42 369 TR = W[6];
Chris@42 370 TT = W[7];
Chris@42 371 TV = FNMS(TT, TU, TR * TS);
Chris@42 372 T11 = FMA(TT, TS, TR * TU);
Chris@42 373 }
Chris@42 374 {
Chris@42 375 E TX, TZ, TW, TY;
Chris@42 376 TX = Tl - Tw;
Chris@42 377 TZ = TG - TB;
Chris@42 378 TW = W[8];
Chris@42 379 TY = W[9];
Chris@42 380 T10 = FMA(TW, TX, TY * TZ);
Chris@42 381 T12 = FNMS(TY, TX, TW * TZ);
Chris@42 382 }
Chris@42 383 Rp[WS(rs, 2)] = TV - T10;
Chris@42 384 Ip[WS(rs, 2)] = T11 + T12;
Chris@42 385 Rm[WS(rs, 2)] = TV + T10;
Chris@42 386 Im[WS(rs, 2)] = T12 - T11;
Chris@42 387 }
/* Outputs at offset WS(rs, 1); coefficients W[2] .. W[5]. */
Chris@42 388 {
Chris@42 389 E T1b, T1l, T1k, T1m;
Chris@42 390 {
Chris@42 391 E T16, T1a, T13, T17;
Chris@42 392 T16 = T14 + T15;
Chris@42 393 T1a = T18 + T19;
Chris@42 394 T13 = W[2];
Chris@42 395 T17 = W[3];
Chris@42 396 T1b = FNMS(T17, T1a, T13 * T16);
Chris@42 397 T1l = FMA(T17, T16, T13 * T1a);
Chris@42 398 }
Chris@42 399 {
Chris@42 400 E T1f, T1j, T1c, T1g;
Chris@42 401 T1f = T1d + T1e;
Chris@42 402 T1j = T1h - T1i;
Chris@42 403 T1c = W[4];
Chris@42 404 T1g = W[5];
Chris@42 405 T1k = FMA(T1c, T1f, T1g * T1j);
Chris@42 406 T1m = FNMS(T1g, T1f, T1c * T1j);
Chris@42 407 }
Chris@42 408 Rp[WS(rs, 1)] = T1b - T1k;
Chris@42 409 Ip[WS(rs, 1)] = T1l + T1m;
Chris@42 410 Rm[WS(rs, 1)] = T1b + T1k;
Chris@42 411 Im[WS(rs, 1)] = T1m - T1l;
Chris@42 412 }
Chris@42 413 }
Chris@42 414 }
Chris@42 415 }
Chris@42 416
/*
 * Two-entry tw_instr table referenced by desc: a TW_FULL entry followed by
 * a TW_NEXT entry.
 * NOTE(review): the field meanings come from the tw_instr declaration, which
 * is outside this file; presumably this describes the layout of the W array
 * consumed by hc2cbdft2_8 -- confirm.
 */
Chris@42 417 static const tw_instr twinstr[] = {
Chris@42 418 {TW_FULL, 1, 8},
Chris@42 419 {TW_NEXT, 1, 0}
Chris@42 420 };
Chris@42 421
/*
 * Descriptor handed to the registration call: the value 8, the codelet name,
 * the twinstr table, &GENUS, and a four-number initializer whose first three
 * values match the operation counts quoted in the generator comment for this
 * variant (68 additions, 18 multiplications, 14 fused multiply/adds).
 */
Chris@42 422 static const hc2c_desc desc = { 8, "hc2cbdft2_8", twinstr, &GENUS, {68, 18, 14, 0} };
Chris@42 423
/*
 * Registration entry point (build without HAVE_FMA): hands hc2cbdft2_8 and
 * its descriptor to planner p through X(khc2c_register), passing
 * HC2C_VIA_DFT.
 */
Chris@42 424 void X(codelet_hc2cbdft2_8) (planner *p) {
Chris@42 425 X(khc2c_register) (p, hc2cbdft2_8, &desc, HC2C_VIA_DFT);
Chris@42 426 }
Chris@42 427 #endif /* HAVE_FMA */