annotate src/fftw-3.3.5/rdft/scalar/r2cf/hc2cfdft_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:48:41 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cfdft_8 -include hc2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 82 FP additions, 52 FP multiplications,
Chris@42 32 * (or, 60 additions, 30 multiplications, 22 fused multiply/add),
Chris@42 33 * 55 stack variables, 2 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cf.h"
Chris@42 36
/*
 * hc2cfdft_8 (HAVE_FMA build): genfft-generated codelet; see the
 * "Generated by" comment above for the generator flags (-n 8 -dit).
 *
 * Rp, Ip, Rm, Im  data arrays; each is read at offsets 0 and WS(rs, 1..3)
 *                 and the same 16 locations are overwritten with the results.
 * W               read-only table; W[0]..W[13] are consumed per iteration.
 * rs              stride handed to WS() to form the element offsets.
 * mb, me          the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms              per-iteration pointer step: Rp/Ip move by +ms, Rm/Im by -ms.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * come from the included project headers and are not visible here; FMA/FNMS
 * are presumably multiply-add / negated-multiply-add forms -- confirm there.
 */
Chris@42 37 static void hc2cfdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numeric constants declared through the DK macro. */
Chris@42 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 41 {
Chris@42 42 INT m;
/* W starts at offset (mb - 1) * 14 and advances by 14 on every iteration. */
Chris@42 43 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Temporaries that outlive the first stage and feed the final output block. */
Chris@42 44 E T1A, T1w, T1z, T1x, T1H, T1v, T1L, T1F;
Chris@42 45 {
Chris@42 46 E Ty, T14, TO, T1o, Tv, TG, T16, T1m, Ta, T19, T1h, TV, T10, TX, TZ;
Chris@42 47 E Tk, T1i, TY, T1b, TF, TB, T1l;
Chris@42 48 {
Chris@42 49 E TH, TN, TK, TM;
Chris@42 50 {
/* Offset-0 inputs: sums/differences of Ip/Im and Rp/Rm, paired with W[0], W[1]. */
Chris@42 51 E Tw, Tx, TI, TJ;
Chris@42 52 Tw = Ip[0];
Chris@42 53 Tx = Im[0];
Chris@42 54 TI = Rm[0];
Chris@42 55 TJ = Rp[0];
Chris@42 56 TH = W[0];
Chris@42 57 Ty = Tw - Tx;
Chris@42 58 TN = Tw + Tx;
Chris@42 59 T14 = TJ + TI;
Chris@42 60 TK = TI - TJ;
Chris@42 61 TM = W[1];
Chris@42 62 }
Chris@42 63 {
Chris@42 64 E Ts, Tp, Tt, Tm, Tr;
Chris@42 65 {
/* Offset WS(rs, 2) inputs; the W[0]/W[1] products are finished here as well. */
Chris@42 66 E Tn, To, TL, T1n;
Chris@42 67 Tn = Ip[WS(rs, 2)];
Chris@42 68 To = Im[WS(rs, 2)];
Chris@42 69 TL = TH * TK;
Chris@42 70 T1n = TM * TK;
Chris@42 71 Ts = Rp[WS(rs, 2)];
Chris@42 72 TF = Tn + To;
Chris@42 73 Tp = Tn - To;
Chris@42 74 TO = FNMS(TM, TN, TL);
Chris@42 75 T1o = FMA(TH, TN, T1n);
Chris@42 76 Tt = Rm[WS(rs, 2)];
Chris@42 77 }
Chris@42 78 Tm = W[6];
Chris@42 79 Tr = W[7];
Chris@42 80 {
/* Combine the WS(rs, 2) sums/differences with W[6..9]. */
Chris@42 81 E TE, TD, T15, TC, Tu, Tq;
Chris@42 82 TB = W[8];
Chris@42 83 TC = Tt - Ts;
Chris@42 84 Tu = Ts + Tt;
Chris@42 85 Tq = Tm * Tp;
Chris@42 86 TE = W[9];
Chris@42 87 TD = TB * TC;
Chris@42 88 T15 = Tm * Tu;
Chris@42 89 Tv = FNMS(Tr, Tu, Tq);
Chris@42 90 T1l = TE * TC;
Chris@42 91 TG = FNMS(TE, TF, TD);
Chris@42 92 T16 = FMA(Tr, Tp, T15);
Chris@42 93 }
Chris@42 94 }
Chris@42 95 }
Chris@42 96 {
Chris@42 97 E TU, TR, TT, T1g, TS;
Chris@42 98 {
/* Offset WS(rs, 1) inputs, combined with W[2..5]. */
Chris@42 99 E T2, T3, T7, T8;
Chris@42 100 T2 = Ip[WS(rs, 1)];
Chris@42 101 T1m = FMA(TB, TF, T1l);
Chris@42 102 T3 = Im[WS(rs, 1)];
Chris@42 103 T7 = Rp[WS(rs, 1)];
Chris@42 104 T8 = Rm[WS(rs, 1)];
Chris@42 105 {
Chris@42 106 E T1, T4, T9, T6, T5, TQ, T18;
Chris@42 107 T1 = W[2];
Chris@42 108 TU = T2 + T3;
Chris@42 109 T4 = T2 - T3;
Chris@42 110 TR = T7 - T8;
Chris@42 111 T9 = T7 + T8;
Chris@42 112 T6 = W[3];
Chris@42 113 T5 = T1 * T4;
Chris@42 114 TQ = W[4];
Chris@42 115 T18 = T1 * T9;
Chris@42 116 TT = W[5];
Chris@42 117 Ta = FNMS(T6, T9, T5);
Chris@42 118 T1g = TQ * TU;
Chris@42 119 TS = TQ * TR;
Chris@42 120 T19 = FMA(T6, T4, T18);
Chris@42 121 }
Chris@42 122 }
Chris@42 123 {
/* Offset WS(rs, 3) inputs, combined with W[10..13]. */
Chris@42 124 E Tc, Td, Th, Ti;
Chris@42 125 Tc = Ip[WS(rs, 3)];
Chris@42 126 T1h = FNMS(TT, TR, T1g);
Chris@42 127 TV = FMA(TT, TU, TS);
Chris@42 128 Td = Im[WS(rs, 3)];
Chris@42 129 Th = Rp[WS(rs, 3)];
Chris@42 130 Ti = Rm[WS(rs, 3)];
Chris@42 131 {
Chris@42 132 E Tb, Te, Tj, Tg, Tf, TW, T1a;
Chris@42 133 Tb = W[10];
Chris@42 134 T10 = Tc + Td;
Chris@42 135 Te = Tc - Td;
Chris@42 136 TX = Th - Ti;
Chris@42 137 Tj = Th + Ti;
Chris@42 138 Tg = W[11];
Chris@42 139 Tf = Tb * Te;
Chris@42 140 TW = W[12];
Chris@42 141 T1a = Tb * Tj;
Chris@42 142 TZ = W[13];
Chris@42 143 Tk = FNMS(Tg, Tj, Tf);
Chris@42 144 T1i = TW * T10;
Chris@42 145 TY = TW * TX;
Chris@42 146 T1b = FMA(Tg, Te, T1a);
Chris@42 147 }
Chris@42 148 }
Chris@42 149 }
Chris@42 150 {
/* All 16 inputs have been loaded above; from here on results are written back in place. */
Chris@42 151 E T1E, T1t, TA, T1s, T1D, T1u, T1e, T13, T1r, T1d;
Chris@42 152 {
Chris@42 153 E TP, T1f, T1q, T12, T17, T1c;
Chris@42 154 {
Chris@42 155 E Tl, T11, Tz, T1p, T1k, T1j;
Chris@42 156 T1E = Ta - Tk;
Chris@42 157 Tl = Ta + Tk;
Chris@42 158 T1j = FNMS(TZ, TX, T1i);
Chris@42 159 T11 = FMA(TZ, T10, TY);
Chris@42 160 Tz = Tv + Ty;
Chris@42 161 T1t = Ty - Tv;
Chris@42 162 T1A = T1o - T1m;
Chris@42 163 T1p = T1m + T1o;
Chris@42 164 T1k = T1h + T1j;
Chris@42 165 T1w = T1j - T1h;
Chris@42 166 T1z = TO - TG;
Chris@42 167 TP = TG + TO;
Chris@42 168 T1f = Tz - Tl;
Chris@42 169 TA = Tl + Tz;
Chris@42 170 T1s = T1k + T1p;
Chris@42 171 T1q = T1k - T1p;
Chris@42 172 T12 = TV + T11;
Chris@42 173 T1x = TV - T11;
Chris@42 174 T1D = T14 - T16;
Chris@42 175 T17 = T14 + T16;
Chris@42 176 T1c = T19 + T1b;
Chris@42 177 T1u = T19 - T1b;
Chris@42 178 }
Chris@42 179 Im[WS(rs, 1)] = KP500000000 * (T1q - T1f);
Chris@42 180 T1e = T12 + TP;
Chris@42 181 T13 = TP - T12;
Chris@42 182 T1r = T17 + T1c;
Chris@42 183 T1d = T17 - T1c;
Chris@42 184 Ip[WS(rs, 2)] = KP500000000 * (T1f + T1q);
Chris@42 185 }
/* Eight outputs that need only the KP500000000 scale factor. */
Chris@42 186 Im[WS(rs, 3)] = KP500000000 * (T13 - TA);
Chris@42 187 Ip[0] = KP500000000 * (TA + T13);
Chris@42 188 Rm[WS(rs, 3)] = KP500000000 * (T1r - T1s);
Chris@42 189 Rp[0] = KP500000000 * (T1r + T1s);
Chris@42 190 Rp[WS(rs, 2)] = KP500000000 * (T1d + T1e);
Chris@42 191 Rm[WS(rs, 1)] = KP500000000 * (T1d - T1e);
Chris@42 192 T1H = T1u + T1t;
Chris@42 193 T1v = T1t - T1u;
Chris@42 194 T1L = T1D + T1E;
Chris@42 195 T1F = T1D - T1E;
Chris@42 196 }
Chris@42 197 }
Chris@42 198 {
Chris@42 199 E T1y, T1I, T1B, T1J;
Chris@42 200 T1y = T1w + T1x;
Chris@42 201 T1I = T1w - T1x;
Chris@42 202 T1B = T1z - T1A;
Chris@42 203 T1J = T1z + T1A;
Chris@42 204 {
/* Remaining eight outputs: each mixes in a KP707106781-weighted term before the KP500000000 scale. */
Chris@42 205 E T1M, T1K, T1C, T1G;
Chris@42 206 T1M = T1I + T1J;
Chris@42 207 T1K = T1I - T1J;
Chris@42 208 T1C = T1y + T1B;
Chris@42 209 T1G = T1B - T1y;
Chris@42 210 Im[0] = -(KP500000000 * (FNMS(KP707106781, T1K, T1H)));
Chris@42 211 Ip[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1K, T1H));
Chris@42 212 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1M, T1L));
Chris@42 213 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP707106781, T1M, T1L));
Chris@42 214 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1G, T1F));
Chris@42 215 Rm[0] = KP500000000 * (FNMS(KP707106781, T1G, T1F));
Chris@42 216 Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP707106781, T1C, T1v)));
Chris@42 217 Ip[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1C, T1v));
Chris@42 218 }
Chris@42 219 }
Chris@42 220 }
Chris@42 221 }
Chris@42 222 }
Chris@42 223
/*
 * Twiddle-instruction table referenced by the codelet descriptor: one TW_FULL
 * entry with arguments (1, 8) followed by one TW_NEXT entry with (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined in project headers
 * not visible here; TW_NEXT presumably ends the list -- confirm.
 */
Chris@42 224 static const tw_instr twinstr[] = {
Chris@42 225 {TW_FULL, 1, 8},
Chris@42 226 {TW_NEXT, 1, 0}
Chris@42 227 };
Chris@42 228
/*
 * Codelet descriptor: size 8, name string "hc2cfdft_8", the twiddle table and
 * &GENUS. The first three values of the trailing initializer {60, 30, 22, 0}
 * equal the counts in the generator comment for this variant (60 additions,
 * 30 multiplications, 22 fused multiply/add); the meaning of the fourth field
 * is not visible in this file.
 */
Chris@42 229 static const hc2c_desc desc = { 8, "hc2cfdft_8", twinstr, &GENUS, {60, 30, 22, 0} };
Chris@42 230
/*
 * Registration entry point for the HAVE_FMA build: hands the codelet function
 * and its descriptor to X(khc2c_register) for planner p, tagged HC2C_VIA_DFT.
 * NOTE(review): X() is a macro from the project headers (presumably a symbol
 * name-mangling wrapper) -- confirm there.
 */
Chris@42 231 void X(codelet_hc2cfdft_8) (planner *p) {
Chris@42 232 X(khc2c_register) (p, hc2cfdft_8, &desc, HC2C_VIA_DFT);
Chris@42 233 }
Chris@42 234 #else /* HAVE_FMA */
Chris@42 235
Chris@42 236 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cfdft_8 -include hc2cf.h */
Chris@42 237
Chris@42 238 /*
Chris@42 239 * This function contains 82 FP additions, 44 FP multiplications,
Chris@42 240 * (or, 68 additions, 30 multiplications, 14 fused multiply/add),
Chris@42 241 * 39 stack variables, 2 constants, and 32 memory accesses
Chris@42 242 */
Chris@42 243 #include "hc2cf.h"
Chris@42 244
/*
 * hc2cfdft_8 (build without HAVE_FMA): genfft-generated codelet; see the
 * "Generated by" comment above for the generator flags (-n 8 -dit, no -fma).
 *
 * Rp, Ip, Rm, Im  data arrays; each is read at offsets 0 and WS(rs, 1..3)
 *                 and the same 16 locations are overwritten with the results.
 * W               read-only table; W[0]..W[13] are consumed per iteration.
 * rs              stride handed to WS() to form the element offsets.
 * mb, me          the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms              per-iteration pointer step: Rp/Ip move by +ms, Rm/Im by -ms.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * come from the included project headers and are not visible here.
 */
Chris@42 245 static void hc2cfdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 246 {
/* KP353553390 is numerically 0.5 * 0.707106781..., so one multiply applies both scale factors. */
Chris@42 247 DK(KP353553390, +0.353553390593273762200422181052424519642417969);
Chris@42 248 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 249 {
Chris@42 250 INT m;
/* W starts at offset (mb - 1) * 14 and advances by 14 on every iteration. */
Chris@42 251 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 252 E Tv, TX, Ts, TY, TE, T1a, TJ, T19, T1l, T1m, T9, T10, Ti, T11, TP;
Chris@42 253 E T16, TU, T17, T1i, T1j;
Chris@42 254 {
/* Inputs at offsets 0 and WS(rs, 2), combined with W[0], W[1] and W[6..9]. */
Chris@42 255 E Tt, Tu, TD, Tz, TA, TB, Tn, TI, Tr, TG, Tk, To;
Chris@42 256 Tt = Ip[0];
Chris@42 257 Tu = Im[0];
Chris@42 258 TD = Tt + Tu;
Chris@42 259 Tz = Rm[0];
Chris@42 260 TA = Rp[0];
Chris@42 261 TB = Tz - TA;
Chris@42 262 {
Chris@42 263 E Tl, Tm, Tp, Tq;
Chris@42 264 Tl = Ip[WS(rs, 2)];
Chris@42 265 Tm = Im[WS(rs, 2)];
Chris@42 266 Tn = Tl - Tm;
Chris@42 267 TI = Tl + Tm;
Chris@42 268 Tp = Rp[WS(rs, 2)];
Chris@42 269 Tq = Rm[WS(rs, 2)];
Chris@42 270 Tr = Tp + Tq;
Chris@42 271 TG = Tp - Tq;
Chris@42 272 }
Chris@42 273 Tv = Tt - Tu;
Chris@42 274 TX = TA + Tz;
Chris@42 275 Tk = W[6];
Chris@42 276 To = W[7];
Chris@42 277 Ts = FNMS(To, Tr, Tk * Tn);
Chris@42 278 TY = FMA(Tk, Tr, To * Tn);
Chris@42 279 {
Chris@42 280 E Ty, TC, TF, TH;
Chris@42 281 Ty = W[0];
Chris@42 282 TC = W[1];
Chris@42 283 TE = FNMS(TC, TD, Ty * TB);
Chris@42 284 T1a = FMA(TC, TB, Ty * TD);
Chris@42 285 TF = W[8];
Chris@42 286 TH = W[9];
Chris@42 287 TJ = FMA(TF, TG, TH * TI);
Chris@42 288 T19 = FNMS(TH, TG, TF * TI);
Chris@42 289 }
Chris@42 290 T1l = TJ + TE;
Chris@42 291 T1m = T1a - T19;
Chris@42 292 }
Chris@42 293 {
/* Inputs at offsets WS(rs, 1) and WS(rs, 3), combined with W[2..5] and W[10..13]. */
Chris@42 294 E T4, TO, T8, TM, Td, TT, Th, TR;
Chris@42 295 {
Chris@42 296 E T2, T3, T6, T7;
Chris@42 297 T2 = Ip[WS(rs, 1)];
Chris@42 298 T3 = Im[WS(rs, 1)];
Chris@42 299 T4 = T2 - T3;
Chris@42 300 TO = T2 + T3;
Chris@42 301 T6 = Rp[WS(rs, 1)];
Chris@42 302 T7 = Rm[WS(rs, 1)];
Chris@42 303 T8 = T6 + T7;
Chris@42 304 TM = T6 - T7;
Chris@42 305 }
Chris@42 306 {
Chris@42 307 E Tb, Tc, Tf, Tg;
Chris@42 308 Tb = Ip[WS(rs, 3)];
Chris@42 309 Tc = Im[WS(rs, 3)];
Chris@42 310 Td = Tb - Tc;
Chris@42 311 TT = Tb + Tc;
Chris@42 312 Tf = Rp[WS(rs, 3)];
Chris@42 313 Tg = Rm[WS(rs, 3)];
Chris@42 314 Th = Tf + Tg;
Chris@42 315 TR = Tf - Tg;
Chris@42 316 }
Chris@42 317 {
Chris@42 318 E T1, T5, Ta, Te;
Chris@42 319 T1 = W[2];
Chris@42 320 T5 = W[3];
Chris@42 321 T9 = FNMS(T5, T8, T1 * T4);
Chris@42 322 T10 = FMA(T1, T8, T5 * T4);
Chris@42 323 Ta = W[10];
Chris@42 324 Te = W[11];
Chris@42 325 Ti = FNMS(Te, Th, Ta * Td);
Chris@42 326 T11 = FMA(Ta, Th, Te * Td);
Chris@42 327 {
Chris@42 328 E TL, TN, TQ, TS;
Chris@42 329 TL = W[4];
Chris@42 330 TN = W[5];
Chris@42 331 TP = FMA(TL, TM, TN * TO);
Chris@42 332 T16 = FNMS(TN, TM, TL * TO);
Chris@42 333 TQ = W[12];
Chris@42 334 TS = W[13];
Chris@42 335 TU = FMA(TQ, TR, TS * TT);
Chris@42 336 T17 = FNMS(TS, TR, TQ * TT);
Chris@42 337 }
Chris@42 338 T1i = T17 - T16;
Chris@42 339 T1j = TP - TU;
Chris@42 340 }
Chris@42 341 }
Chris@42 342 {
/* All 16 inputs have been loaded above. First output group: terms pre-scaled by KP500000000 or KP353553390, then summed/differenced into place. */
Chris@42 343 E T1h, T1t, T1w, T1y, T1o, T1s, T1r, T1x;
Chris@42 344 {
Chris@42 345 E T1f, T1g, T1u, T1v;
Chris@42 346 T1f = Tv - Ts;
Chris@42 347 T1g = T10 - T11;
Chris@42 348 T1h = KP500000000 * (T1f - T1g);
Chris@42 349 T1t = KP500000000 * (T1g + T1f);
Chris@42 350 T1u = T1i - T1j;
Chris@42 351 T1v = T1l + T1m;
Chris@42 352 T1w = KP353553390 * (T1u - T1v);
Chris@42 353 T1y = KP353553390 * (T1u + T1v);
Chris@42 354 }
Chris@42 355 {
Chris@42 356 E T1k, T1n, T1p, T1q;
Chris@42 357 T1k = T1i + T1j;
Chris@42 358 T1n = T1l - T1m;
Chris@42 359 T1o = KP353553390 * (T1k + T1n);
Chris@42 360 T1s = KP353553390 * (T1n - T1k);
Chris@42 361 T1p = TX - TY;
Chris@42 362 T1q = T9 - Ti;
Chris@42 363 T1r = KP500000000 * (T1p - T1q);
Chris@42 364 T1x = KP500000000 * (T1p + T1q);
Chris@42 365 }
Chris@42 366 Ip[WS(rs, 1)] = T1h + T1o;
Chris@42 367 Rp[WS(rs, 1)] = T1x + T1y;
Chris@42 368 Im[WS(rs, 2)] = T1o - T1h;
Chris@42 369 Rm[WS(rs, 2)] = T1x - T1y;
Chris@42 370 Rm[0] = T1r - T1s;
Chris@42 371 Im[0] = T1w - T1t;
Chris@42 372 Rp[WS(rs, 3)] = T1r + T1s;
Chris@42 373 Ip[WS(rs, 3)] = T1t + T1w;
Chris@42 374 }
Chris@42 375 {
/* Second output group: these eight results need only the KP500000000 scale factor. */
Chris@42 376 E Tx, T15, T1c, T1e, TW, T14, T13, T1d;
Chris@42 377 {
Chris@42 378 E Tj, Tw, T18, T1b;
Chris@42 379 Tj = T9 + Ti;
Chris@42 380 Tw = Ts + Tv;
Chris@42 381 Tx = Tj + Tw;
Chris@42 382 T15 = Tw - Tj;
Chris@42 383 T18 = T16 + T17;
Chris@42 384 T1b = T19 + T1a;
Chris@42 385 T1c = T18 - T1b;
Chris@42 386 T1e = T18 + T1b;
Chris@42 387 }
Chris@42 388 {
Chris@42 389 E TK, TV, TZ, T12;
Chris@42 390 TK = TE - TJ;
Chris@42 391 TV = TP + TU;
Chris@42 392 TW = TK - TV;
Chris@42 393 T14 = TV + TK;
Chris@42 394 TZ = TX + TY;
Chris@42 395 T12 = T10 + T11;
Chris@42 396 T13 = TZ - T12;
Chris@42 397 T1d = TZ + T12;
Chris@42 398 }
Chris@42 399 Ip[0] = KP500000000 * (Tx + TW);
Chris@42 400 Rp[0] = KP500000000 * (T1d + T1e);
Chris@42 401 Im[WS(rs, 3)] = KP500000000 * (TW - Tx);
Chris@42 402 Rm[WS(rs, 3)] = KP500000000 * (T1d - T1e);
Chris@42 403 Rm[WS(rs, 1)] = KP500000000 * (T13 - T14);
Chris@42 404 Im[WS(rs, 1)] = KP500000000 * (T1c - T15);
Chris@42 405 Rp[WS(rs, 2)] = KP500000000 * (T13 + T14);
Chris@42 406 Ip[WS(rs, 2)] = KP500000000 * (T15 + T1c);
Chris@42 407 }
Chris@42 408 }
Chris@42 409 }
Chris@42 410 }
Chris@42 411
/*
 * Twiddle-instruction table referenced by the codelet descriptor: one TW_FULL
 * entry with arguments (1, 8) followed by one TW_NEXT entry with (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined in project headers
 * not visible here; TW_NEXT presumably ends the list -- confirm.
 */
Chris@42 412 static const tw_instr twinstr[] = {
Chris@42 413 {TW_FULL, 1, 8},
Chris@42 414 {TW_NEXT, 1, 0}
Chris@42 415 };
Chris@42 416
/*
 * Codelet descriptor: size 8, name string "hc2cfdft_8", the twiddle table and
 * &GENUS. The first three values of the trailing initializer {68, 30, 14, 0}
 * equal the counts in the generator comment for this variant (68 additions,
 * 30 multiplications, 14 fused multiply/add); the meaning of the fourth field
 * is not visible in this file.
 */
Chris@42 417 static const hc2c_desc desc = { 8, "hc2cfdft_8", twinstr, &GENUS, {68, 30, 14, 0} };
Chris@42 418
/*
 * Registration entry point for the build without HAVE_FMA: hands the codelet
 * function and its descriptor to X(khc2c_register) for planner p, tagged
 * HC2C_VIA_DFT.
 * NOTE(review): X() is a macro from the project headers (presumably a symbol
 * name-mangling wrapper) -- confirm there.
 */
Chris@42 419 void X(codelet_hc2cfdft_8) (planner *p) {
Chris@42 420 X(khc2c_register) (p, hc2cfdft_8, &desc, HC2C_VIA_DFT);
Chris@42 421 }
Chris@42 422 #endif /* HAVE_FMA */