annotate src/fftw-3.3.5/rdft/scalar/r2cf/hc2cf2_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:48:28 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cf2_8 -include hc2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 74 FP additions, 50 FP multiplications,
Chris@42 32 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
Chris@42 33 * 64 stack variables, 1 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cf.h"
Chris@42 36
/*
 * hc2cf2_8, FMA variant (this half of the file is compiled when HAVE_FMA
 * is defined).
 *
 * For each m in [mb, me) the loop body reads 16 values -- Rp, Ip, Rm and
 * Im at offsets 0 and WS(rs, 1..3) -- together with six twiddle values
 * W[0..5], and stores 16 results back into the same four arrays at the
 * same offsets.
 *
 * Per iteration Rp and Ip advance by +ms, Rm and Im by -ms, and W by 6.
 * W is first offset by (mb - 1) * 6, so iteration m uses the six values
 * starting at index (m - 1) * 6 of the W that was passed in.
 *
 * NOTE(review): FMA/FMS/FNMS, DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE
 * are defined outside this file. The comments below assume the usual FFTW
 * meanings FMA(a,b,c) = a*b + c, FMS(a,b,c) = a*b - c and
 * FNMS(a,b,c) = c - a*b -- confirm against the FFTW headers.
 */
Chris@42 37 static void hc2cf2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* The only constant used here; the literal equals 1/sqrt(2). */
Chris@42 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 40 {
Chris@42 41 INT m;
Chris@42 42 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
/* These eight values are produced in the nested blocks and consumed by the final output block. */
Chris@42 43 E TS, T1m, TJ, T1l, T1k, Tw, T1w, T1u;
Chris@42 44 {
Chris@42 45 E T2, T3, Tl, Tn, T5, T4, Tm, Tr, T6;
/*
 * Load the six stored twiddle values W[0..5]. The other factors applied
 * to the inputs (To, Ts, Tf, T7, Ti, Tb, TG, TC below) are computed from
 * products of these six values only.
 */
Chris@42 46 T2 = W[0];
Chris@42 47 T3 = W[2];
Chris@42 48 Tl = W[4];
Chris@42 49 Tn = W[5];
Chris@42 50 T5 = W[1];
Chris@42 51 T4 = T2 * T3;
Chris@42 52 Tm = T2 * Tl;
Chris@42 53 Tr = T2 * Tn;
Chris@42 54 T6 = W[3];
Chris@42 55 {
Chris@42 56 E T1, T1s, TG, Td, T1r, Tu, TY, Tk, TW, T18, T1d, TD, TH, TA, T13;
Chris@42 57 E TE, T14;
Chris@42 58 {
Chris@42 59 E To, Ts, Tf, T7, T8, Ti, Tb, T9, Tc, TC, Ta, TF, TB, Tg, Th;
Chris@42 60 E Tj;
/* Rp[0] and Rm[0] are used as loaded; every other input is combined with twiddle factors. */
Chris@42 61 T1 = Rp[0];
Chris@42 62 To = FMA(T5, Tn, Tm);
Chris@42 63 Ts = FNMS(T5, Tl, Tr);
Chris@42 64 Tf = FMA(T5, T6, T4);
Chris@42 65 T7 = FNMS(T5, T6, T4);
Chris@42 66 Ta = T2 * T6;
Chris@42 67 T1s = Rm[0];
Chris@42 68 T8 = Rp[WS(rs, 2)];
Chris@42 69 TF = Tf * Tn;
Chris@42 70 TB = Tf * Tl;
Chris@42 71 Ti = FNMS(T5, T3, Ta);
Chris@42 72 Tb = FMA(T5, T3, Ta);
Chris@42 73 T9 = T7 * T8;
Chris@42 74 Tc = Rm[WS(rs, 2)];
Chris@42 75 TG = FNMS(Ti, Tl, TF);
Chris@42 76 TC = FMA(Ti, Tn, TB);
Chris@42 77 {
Chris@42 78 E Tp, T1q, Tt, Tq, TX;
Chris@42 79 Tp = Rp[WS(rs, 3)];
Chris@42 80 Td = FMA(Tb, Tc, T9);
Chris@42 81 T1q = T7 * Tc;
Chris@42 82 Tt = Rm[WS(rs, 3)];
Chris@42 83 Tq = To * Tp;
Chris@42 84 Tg = Rp[WS(rs, 1)];
Chris@42 85 T1r = FNMS(Tb, T8, T1q);
Chris@42 86 TX = To * Tt;
Chris@42 87 Tu = FMA(Ts, Tt, Tq);
Chris@42 88 Th = Tf * Tg;
Chris@42 89 Tj = Rm[WS(rs, 1)];
Chris@42 90 TY = FNMS(Ts, Tp, TX);
Chris@42 91 }
Chris@42 92 {
Chris@42 93 E TO, TQ, TN, TP, T1a, T1b;
Chris@42 94 {
Chris@42 95 E TK, TM, TL, T19, TV;
Chris@42 96 TK = Ip[WS(rs, 3)];
Chris@42 97 TM = Im[WS(rs, 3)];
Chris@42 98 Tk = FMA(Ti, Tj, Th);
Chris@42 99 TV = Tf * Tj;
Chris@42 100 TL = Tl * TK;
Chris@42 101 T19 = Tl * TM;
Chris@42 102 TO = Ip[WS(rs, 1)];
Chris@42 103 TW = FNMS(Ti, Tg, TV);
Chris@42 104 TQ = Im[WS(rs, 1)];
Chris@42 105 TN = FMA(Tn, TM, TL);
Chris@42 106 TP = T3 * TO;
Chris@42 107 T1a = FNMS(Tn, TK, T19);
Chris@42 108 T1b = T3 * TQ;
Chris@42 109 }
Chris@42 110 {
Chris@42 111 E Tx, Tz, Ty, T12, T1c, TR;
Chris@42 112 Tx = Ip[0];
Chris@42 113 TR = FMA(T6, TQ, TP);
Chris@42 114 Tz = Im[0];
Chris@42 115 T1c = FNMS(T6, TO, T1b);
Chris@42 116 Ty = T2 * Tx;
Chris@42 117 T18 = TN - TR;
Chris@42 118 TS = TN + TR;
Chris@42 119 T12 = T2 * Tz;
Chris@42 120 T1d = T1a - T1c;
Chris@42 121 T1m = T1a + T1c;
Chris@42 122 TD = Ip[WS(rs, 2)];
Chris@42 123 TH = Im[WS(rs, 2)];
Chris@42 124 TA = FMA(T5, Tz, Ty);
Chris@42 125 T13 = FNMS(T5, Tx, T12);
Chris@42 126 TE = TC * TD;
Chris@42 127 T14 = TC * TH;
Chris@42 128 }
Chris@42 129 }
Chris@42 130 }
Chris@42 131 {
Chris@42 132 E Te, T1p, T1t, Tv;
Chris@42 133 {
Chris@42 134 E T1g, T10, T1z, T1B, T1A, T1j, T1C, T1f;
Chris@42 135 {
Chris@42 136 E T1x, T11, T16, T1y;
Chris@42 137 {
Chris@42 138 E TU, TZ, TI, T15;
/* All 16 inputs have been loaded by this point; what follows is sums and differences of the twiddled values. */
Chris@42 139 Te = T1 + Td;
Chris@42 140 TU = T1 - Td;
Chris@42 141 TZ = TW - TY;
Chris@42 142 T1p = TW + TY;
Chris@42 143 TI = FMA(TG, TH, TE);
Chris@42 144 T15 = FNMS(TG, TD, T14);
Chris@42 145 T1t = T1r + T1s;
Chris@42 146 T1x = T1s - T1r;
Chris@42 147 T1g = TU - TZ;
Chris@42 148 T10 = TU + TZ;
Chris@42 149 T11 = TA - TI;
Chris@42 150 TJ = TA + TI;
Chris@42 151 T1l = T13 + T15;
Chris@42 152 T16 = T13 - T15;
Chris@42 153 T1y = Tk - Tu;
Chris@42 154 Tv = Tk + Tu;
Chris@42 155 }
Chris@42 156 {
Chris@42 157 E T1i, T1e, T17, T1h;
Chris@42 158 T1i = T18 + T1d;
Chris@42 159 T1e = T18 - T1d;
Chris@42 160 T17 = T11 + T16;
Chris@42 161 T1h = T16 - T11;
Chris@42 162 T1z = T1x - T1y;
Chris@42 163 T1B = T1y + T1x;
Chris@42 164 T1A = T1h + T1i;
Chris@42 165 T1j = T1h - T1i;
Chris@42 166 T1C = T1e - T17;
Chris@42 167 T1f = T17 + T1e;
Chris@42 168 }
Chris@42 169 }
/*
 * First eight stores: each output combines one term with KP707106781
 * times another through a single FMA/FMS/FNMS call.
 */
Chris@42 170 Rm[0] = FNMS(KP707106781, T1j, T1g);
Chris@42 171 Im[0] = FMS(KP707106781, T1C, T1B);
Chris@42 172 Rp[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
Chris@42 173 Rm[WS(rs, 2)] = FNMS(KP707106781, T1f, T10);
Chris@42 174 Ip[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
Chris@42 175 Im[WS(rs, 2)] = FMS(KP707106781, T1A, T1z);
Chris@42 176 Rp[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
Chris@42 177 Ip[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
Chris@42 178 }
Chris@42 179 T1k = Te - Tv;
Chris@42 180 Tw = Te + Tv;
Chris@42 181 T1w = T1t - T1p;
Chris@42 182 T1u = T1p + T1t;
Chris@42 183 }
Chris@42 184 }
Chris@42 185 }
Chris@42 186 {
Chris@42 187 E TT, T1v, T1n, T1o;
/* Remaining eight stores: plain sums and differences, no multiplication. */
Chris@42 188 TT = TJ + TS;
Chris@42 189 T1v = TS - TJ;
Chris@42 190 T1n = T1l - T1m;
Chris@42 191 T1o = T1l + T1m;
Chris@42 192 Ip[WS(rs, 2)] = T1v + T1w;
Chris@42 193 Im[WS(rs, 1)] = T1v - T1w;
Chris@42 194 Rp[0] = Tw + TT;
Chris@42 195 Rm[WS(rs, 3)] = Tw - TT;
Chris@42 196 Ip[0] = T1o + T1u;
Chris@42 197 Im[WS(rs, 3)] = T1o - T1u;
Chris@42 198 Rp[WS(rs, 2)] = T1k + T1n;
Chris@42 199 Rm[WS(rs, 1)] = T1k - T1n;
Chris@42 200 }
Chris@42 201 }
Chris@42 202 }
Chris@42 203 }
Chris@42 204
/*
 * Twiddle instruction list for the HAVE_FMA variant: three TW_CEXP entries
 * whose last field is 1, 3 and 7, followed by a TW_NEXT entry.
 * NOTE(review): three entries of two reals each would account for the six
 * W values hc2cf2_8 reads per iteration; this assumes each TW_CEXP entry
 * yields a pair of reals -- confirm against the tw_instr definition.
 */
Chris@42 205 static const tw_instr twinstr[] = {
Chris@42 206 {TW_CEXP, 1, 1},
Chris@42 207 {TW_CEXP, 1, 3},
Chris@42 208 {TW_CEXP, 1, 7},
Chris@42 209 {TW_NEXT, 1, 0}
Chris@42 210 };
Chris@42 211
/*
 * Descriptor for the HAVE_FMA variant: 8 (the -n 8 of the generator command
 * line), the codelet name, its twiddle instruction list, &GENUS, and the
 * counts {44, 20, 30, 0}. The first three counts match the additions,
 * multiplications and fused multiply/add figures in this variant's
 * generated header comment.
 */
Chris@42 212 static const hc2c_desc desc = { 8, "hc2cf2_8", twinstr, &GENUS, {44, 20, 30, 0} };
Chris@42 213
/*
 * Hands hc2cf2_8 and its descriptor to X(khc2c_register) for planner p,
 * with HC2C_VIA_RDFT as the last argument (HAVE_FMA variant).
 * NOTE(review): X() is a macro defined outside this file, presumably a
 * name-prefixing macro -- confirm.
 */
Chris@42 214 void X(codelet_hc2cf2_8) (planner *p) {
Chris@42 215 X(khc2c_register) (p, hc2cf2_8, &desc, HC2C_VIA_RDFT);
Chris@42 216 }
Chris@42 217 #else /* HAVE_FMA */
Chris@42 218
Chris@42 219 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cf2_8 -include hc2cf.h */
Chris@42 220
Chris@42 221 /*
Chris@42 222 * This function contains 74 FP additions, 44 FP multiplications,
Chris@42 223 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
Chris@42 224 * 42 stack variables, 1 constants, and 32 memory accesses
Chris@42 225 */
Chris@42 226 #include "hc2cf.h"
Chris@42 227
/*
 * hc2cf2_8, variant compiled when HAVE_FMA is not defined.
 *
 * For each m in [mb, me) the loop body reads 16 values -- Rp, Ip, Rm and
 * Im at offsets 0 and WS(rs, 1..3) -- together with six twiddle values
 * W[0..5], and stores 16 results back into the same four arrays at the
 * same offsets.
 *
 * Per iteration Rp and Ip advance by +ms, Rm and Im by -ms, and W by 6.
 * W is first offset by (mb - 1) * 6, so iteration m uses the six values
 * starting at index (m - 1) * 6 of the W that was passed in.
 *
 * NOTE(review): FMA/FNMS, DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE are
 * defined outside this file. The comments below assume the usual FFTW
 * meanings FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm
 * against the FFTW headers.
 */
Chris@42 228 static void hc2cf2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 229 {
/* The only constant used here; the literal equals 1/sqrt(2). */
Chris@42 230 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 231 {
Chris@42 232 INT m;
Chris@42 233 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Twiddle values: six loaded from W plus eight derived from them in the block below. */
Chris@42 234 E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
Chris@42 235 {
Chris@42 236 E T4, Tb, T7, Ta;
/*
 * Load W[0..5] and form the derived factors T8, Tc, Tg, Ti (products of
 * W[0..3]) and Tn, Tz, Tp, Tx (which also involve W[4] and W[5]).
 */
Chris@42 237 T2 = W[0];
Chris@42 238 T5 = W[1];
Chris@42 239 T3 = W[2];
Chris@42 240 T6 = W[3];
Chris@42 241 T4 = T2 * T3;
Chris@42 242 Tb = T5 * T3;
Chris@42 243 T7 = T5 * T6;
Chris@42 244 Ta = T2 * T6;
Chris@42 245 T8 = T4 - T7;
Chris@42 246 Tc = Ta + Tb;
Chris@42 247 Tg = T4 + T7;
Chris@42 248 Ti = Ta - Tb;
Chris@42 249 Tl = W[4];
Chris@42 250 Tm = W[5];
Chris@42 251 Tn = FMA(T2, Tl, T5 * Tm);
Chris@42 252 Tz = FNMS(Ti, Tl, Tg * Tm);
Chris@42 253 Tp = FNMS(T5, Tl, T2 * Tm);
Chris@42 254 Tx = FMA(Tg, Tl, Ti * Tm);
Chris@42 255 }
Chris@42 256 {
Chris@42 257 E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
Chris@42 258 E TT;
/* Rp[0] and Rm[0] are used as loaded; Rp[WS(rs, 2)] and Rm[WS(rs, 2)] are combined with T8 and Tc. */
Chris@42 259 {
Chris@42 260 E T1, T1c, Te, T1b, T9, Td;
Chris@42 261 T1 = Rp[0];
Chris@42 262 T1c = Rm[0];
Chris@42 263 T9 = Rp[WS(rs, 2)];
Chris@42 264 Td = Rm[WS(rs, 2)];
Chris@42 265 Te = FMA(T8, T9, Tc * Td);
Chris@42 266 T1b = FNMS(Tc, T9, T8 * Td);
Chris@42 267 Tf = T1 + Te;
Chris@42 268 T1i = T1c - T1b;
Chris@42 269 TL = T1 - Te;
Chris@42 270 T1d = T1b + T1c;
Chris@42 271 }
/* Ip/Im at offsets 3 and 1, combined with (Tl, Tm) and (T3, T6) respectively. */
Chris@42 272 {
Chris@42 273 E TF, TW, TI, TX;
Chris@42 274 {
Chris@42 275 E TD, TE, TG, TH;
Chris@42 276 TD = Ip[WS(rs, 3)];
Chris@42 277 TE = Im[WS(rs, 3)];
Chris@42 278 TF = FMA(Tl, TD, Tm * TE);
Chris@42 279 TW = FNMS(Tm, TD, Tl * TE);
Chris@42 280 TG = Ip[WS(rs, 1)];
Chris@42 281 TH = Im[WS(rs, 1)];
Chris@42 282 TI = FMA(T3, TG, T6 * TH);
Chris@42 283 TX = FNMS(T6, TG, T3 * TH);
Chris@42 284 }
Chris@42 285 TJ = TF + TI;
Chris@42 286 T17 = TW + TX;
Chris@42 287 TV = TF - TI;
Chris@42 288 TY = TW - TX;
Chris@42 289 }
/* Rp/Rm at offsets 1 and 3, combined with (Tg, Ti) and (Tn, Tp) respectively. */
Chris@42 290 {
Chris@42 291 E Tk, TM, Tr, TN;
Chris@42 292 {
Chris@42 293 E Th, Tj, To, Tq;
Chris@42 294 Th = Rp[WS(rs, 1)];
Chris@42 295 Tj = Rm[WS(rs, 1)];
Chris@42 296 Tk = FMA(Tg, Th, Ti * Tj);
Chris@42 297 TM = FNMS(Ti, Th, Tg * Tj);
Chris@42 298 To = Rp[WS(rs, 3)];
Chris@42 299 Tq = Rm[WS(rs, 3)];
Chris@42 300 Tr = FMA(Tn, To, Tp * Tq);
Chris@42 301 TN = FNMS(Tp, To, Tn * Tq);
Chris@42 302 }
Chris@42 303 Ts = Tk + Tr;
Chris@42 304 T1j = Tk - Tr;
Chris@42 305 TO = TM - TN;
Chris@42 306 T1a = TM + TN;
Chris@42 307 }
/* Ip/Im at offsets 0 and 2, combined with (T2, T5) and (Tx, Tz) respectively. */
Chris@42 308 {
Chris@42 309 E Tw, TR, TB, TS;
Chris@42 310 {
Chris@42 311 E Tu, Tv, Ty, TA;
Chris@42 312 Tu = Ip[0];
Chris@42 313 Tv = Im[0];
Chris@42 314 Tw = FMA(T2, Tu, T5 * Tv);
Chris@42 315 TR = FNMS(T5, Tu, T2 * Tv);
Chris@42 316 Ty = Ip[WS(rs, 2)];
Chris@42 317 TA = Im[WS(rs, 2)];
Chris@42 318 TB = FMA(Tx, Ty, Tz * TA);
Chris@42 319 TS = FNMS(Tz, Ty, Tx * TA);
Chris@42 320 }
Chris@42 321 TC = Tw + TB;
Chris@42 322 T16 = TR + TS;
Chris@42 323 TQ = Tw - TB;
Chris@42 324 TT = TR - TS;
Chris@42 325 }
/*
 * Output stage. All 16 inputs have been read by this point. The first
 * eight stores are plain sums and differences; the last eight (two groups
 * of four) each include a term scaled by KP707106781.
 */
Chris@42 326 {
Chris@42 327 E Tt, TK, T1f, T1g;
Chris@42 328 Tt = Tf + Ts;
Chris@42 329 TK = TC + TJ;
Chris@42 330 Rm[WS(rs, 3)] = Tt - TK;
Chris@42 331 Rp[0] = Tt + TK;
Chris@42 332 {
Chris@42 333 E T19, T1e, T15, T18;
Chris@42 334 T19 = T16 + T17;
Chris@42 335 T1e = T1a + T1d;
Chris@42 336 Im[WS(rs, 3)] = T19 - T1e;
Chris@42 337 Ip[0] = T19 + T1e;
Chris@42 338 T15 = Tf - Ts;
Chris@42 339 T18 = T16 - T17;
Chris@42 340 Rm[WS(rs, 1)] = T15 - T18;
Chris@42 341 Rp[WS(rs, 2)] = T15 + T18;
Chris@42 342 }
Chris@42 343 T1f = TJ - TC;
Chris@42 344 T1g = T1d - T1a;
Chris@42 345 Im[WS(rs, 1)] = T1f - T1g;
Chris@42 346 Ip[WS(rs, 2)] = T1f + T1g;
Chris@42 347 {
Chris@42 348 E T11, T1k, T14, T1h, T12, T13;
Chris@42 349 T11 = TL - TO;
Chris@42 350 T1k = T1i - T1j;
Chris@42 351 T12 = TT - TQ;
Chris@42 352 T13 = TV + TY;
Chris@42 353 T14 = KP707106781 * (T12 - T13);
Chris@42 354 T1h = KP707106781 * (T12 + T13);
Chris@42 355 Rm[0] = T11 - T14;
Chris@42 356 Ip[WS(rs, 1)] = T1h + T1k;
Chris@42 357 Rp[WS(rs, 3)] = T11 + T14;
Chris@42 358 Im[WS(rs, 2)] = T1h - T1k;
Chris@42 359 }
Chris@42 360 {
Chris@42 361 E TP, T1m, T10, T1l, TU, TZ;
Chris@42 362 TP = TL + TO;
Chris@42 363 T1m = T1j + T1i;
Chris@42 364 TU = TQ + TT;
Chris@42 365 TZ = TV - TY;
Chris@42 366 T10 = KP707106781 * (TU + TZ);
Chris@42 367 T1l = KP707106781 * (TZ - TU);
Chris@42 368 Rm[WS(rs, 2)] = TP - T10;
Chris@42 369 Ip[WS(rs, 3)] = T1l + T1m;
Chris@42 370 Rp[WS(rs, 1)] = TP + T10;
Chris@42 371 Im[0] = T1l - T1m;
Chris@42 372 }
Chris@42 373 }
Chris@42 374 }
Chris@42 375 }
Chris@42 376 }
Chris@42 377 }
Chris@42 378
/*
 * Twiddle instruction list for the non-FMA variant: three TW_CEXP entries
 * whose last field is 1, 3 and 7, followed by a TW_NEXT entry.
 * NOTE(review): three entries of two reals each would account for the six
 * W values hc2cf2_8 reads per iteration; this assumes each TW_CEXP entry
 * yields a pair of reals -- confirm against the tw_instr definition.
 */
Chris@42 379 static const tw_instr twinstr[] = {
Chris@42 380 {TW_CEXP, 1, 1},
Chris@42 381 {TW_CEXP, 1, 3},
Chris@42 382 {TW_CEXP, 1, 7},
Chris@42 383 {TW_NEXT, 1, 0}
Chris@42 384 };
Chris@42 385
/*
 * Descriptor for the non-FMA variant: 8 (the -n 8 of the generator command
 * line), the codelet name, its twiddle instruction list, &GENUS, and the
 * counts {56, 26, 18, 0}. The first three counts match the additions,
 * multiplications and fused multiply/add figures in this variant's
 * generated header comment.
 */
Chris@42 386 static const hc2c_desc desc = { 8, "hc2cf2_8", twinstr, &GENUS, {56, 26, 18, 0} };
Chris@42 387
/*
 * Hands hc2cf2_8 and its descriptor to X(khc2c_register) for planner p,
 * with HC2C_VIA_RDFT as the last argument (non-FMA variant).
 * NOTE(review): X() is a macro defined outside this file, presumably a
 * name-prefixing macro -- confirm.
 */
Chris@42 388 void X(codelet_hc2cf2_8) (planner *p) {
Chris@42 389 X(khc2c_register) (p, hc2cf2_8, &desc, HC2C_VIA_RDFT);
Chris@42 390 }
Chris@42 391 #endif /* HAVE_FMA */