annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cb2_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:51:39 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hc2cb2_8 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 74 FP additions, 50 FP multiplications,
Chris@42 32 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
Chris@42 33 * 64 stack variables, 1 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cb2_8 (HAVE_FMA variant).
 *
 * For every m in [mb, me) this routine loads 16 values -- Rp[k], Ip[k],
 * Rm[k], Im[k] for k = 0..3, indexed with WS(rs, k) -- combines them with the
 * six twiddle values W[0..5], and stores 16 results back to those same
 * locations. All loads of an iteration happen before its first store.
 *
 * Per iteration Rp and Ip advance by +ms, Rm and Im move by -ms, and W
 * advances by 6. Before the first iteration W is offset by (mb - 1) * 6.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this file. FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm against the
 * project headers.
 *
 * The statement order is the generator's pipeline schedule (see the
 * "Generated by" command line earlier in this file); do not hand-edit.
 */
Chris@42 37 static void hc2cb2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* The only numeric constant used by the butterfly (approximately 1/sqrt(2)). */
Chris@42 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 40 {
Chris@42 41 INT m;
/* The loop header also steps all four data pointers and the twiddle pointer. */
Chris@42 42 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Values declared here stay live until the last four stores at the bottom of the loop body. */
Chris@42 43 E Tf, Ti, TK, Tq, TH, TT, TX, TW, TY, TU, TI;
Chris@42 44 {
Chris@42 45 E Tg, Tl, Tp, Th, T1n, T1t, Tj;
/* Load the six twiddle values for this m and start forming their pairwise products. */
Chris@42 46 Tf = W[0];
Chris@42 47 Tg = W[2];
Chris@42 48 Tl = W[4];
Chris@42 49 Tp = W[5];
Chris@42 50 Ti = W[1];
Chris@42 51 Th = Tf * Tg;
Chris@42 52 T1n = Tf * Tl;
Chris@42 53 T1t = Tf * Tp;
Chris@42 54 Tj = W[3];
Chris@42 55 {
Chris@42 56 E T1o, T1u, Tk, T1b, To, T1e, T13, TP, T1p, T7, T1h, T1v, TZ, Tv, T1i;
Chris@42 57 E TB, TA, TQ, Te, T1w, TE, T1j;
Chris@42 58 {
Chris@42 59 E Tr, T3, Ts, T1f, TO, TL, T6, Tt;
Chris@42 60 {
Chris@42 61 E TM, TN, T4, T5;
Chris@42 62 {
Chris@42 63 E T1, Tn, T2, TJ, Tm;
/* Input loads are interleaved with the remaining twiddle-product computations
   (T1o, T1u, Tk, T1b, To, T1e, TK, Tq are all built from W[0..5] only). */
Chris@42 64 T1 = Rp[0];
Chris@42 65 T1o = FMA(Ti, Tp, T1n);
Chris@42 66 T1u = FNMS(Ti, Tl, T1t);
Chris@42 67 Tk = FMA(Ti, Tj, Th);
Chris@42 68 T1b = FNMS(Ti, Tj, Th);
Chris@42 69 Tn = Tf * Tj;
Chris@42 70 T2 = Rm[WS(rs, 3)];
Chris@42 71 TM = Ip[0];
Chris@42 72 TJ = Tk * Tp;
Chris@42 73 Tm = Tk * Tl;
Chris@42 74 To = FNMS(Ti, Tg, Tn);
Chris@42 75 T1e = FMA(Ti, Tg, Tn);
Chris@42 76 Tr = T1 - T2;
Chris@42 77 T3 = T1 + T2;
Chris@42 78 TK = FNMS(To, Tl, TJ);
Chris@42 79 Tq = FMA(To, Tp, Tm);
Chris@42 80 TN = Im[WS(rs, 3)];
Chris@42 81 }
Chris@42 82 T4 = Rp[WS(rs, 2)];
Chris@42 83 T5 = Rm[WS(rs, 1)];
Chris@42 84 Ts = Ip[WS(rs, 2)];
Chris@42 85 T1f = TM - TN;
Chris@42 86 TO = TM + TN;
Chris@42 87 TL = T4 - T5;
Chris@42 88 T6 = T4 + T5;
Chris@42 89 Tt = Im[WS(rs, 1)];
Chris@42 90 }
Chris@42 91 {
Chris@42 92 E Tw, Ta, TC, Tz, Td, TD;
Chris@42 93 {
Chris@42 94 E Tx, Ty, Tb, Tc;
Chris@42 95 {
Chris@42 96 E T8, T1g, Tu, T9;
Chris@42 97 T8 = Rp[WS(rs, 1)];
Chris@42 98 T13 = TO - TL;
Chris@42 99 TP = TL + TO;
Chris@42 100 T1p = T3 - T6;
Chris@42 101 T7 = T3 + T6;
Chris@42 102 T1g = Ts - Tt;
Chris@42 103 Tu = Ts + Tt;
Chris@42 104 T9 = Rm[WS(rs, 2)];
Chris@42 105 Tx = Ip[WS(rs, 1)];
Chris@42 106 T1h = T1f + T1g;
Chris@42 107 T1v = T1f - T1g;
Chris@42 108 TZ = Tr + Tu;
Chris@42 109 Tv = Tr - Tu;
Chris@42 110 Tw = T8 - T9;
Chris@42 111 Ta = T8 + T9;
Chris@42 112 Ty = Im[WS(rs, 2)];
Chris@42 113 }
Chris@42 114 Tb = Rm[0];
Chris@42 115 Tc = Rp[WS(rs, 3)];
Chris@42 116 TC = Ip[WS(rs, 3)];
Chris@42 117 T1i = Tx - Ty;
Chris@42 118 Tz = Tx + Ty;
Chris@42 119 TB = Tb - Tc;
Chris@42 120 Td = Tb + Tc;
Chris@42 121 TD = Im[0];
Chris@42 122 }
Chris@42 123 TA = Tw - Tz;
Chris@42 124 TQ = Tw + Tz;
Chris@42 125 Te = Ta + Td;
Chris@42 126 T1w = Ta - Td;
Chris@42 127 TE = TC + TD;
Chris@42 128 T1j = TC - TD;
Chris@42 129 }
Chris@42 130 }
/* All 16 inputs are now in locals; from here on results are stored in place. */
Chris@42 131 {
Chris@42 132 E T1x, T1k, T1r, TG, TS, T19, T15, T17, T11, T16, T12;
Chris@42 133 {
Chris@42 134 E T1B, T1z, T10, T1A, T1C;
Chris@42 135 T1x = T1v - T1w;
Chris@42 136 T1B = T1w + T1v;
/* Rp[0] (and Rm[0] a few lines below) are plain sums: no twiddle value is applied to them. */
Chris@42 137 Rp[0] = T7 + Te;
Chris@42 138 {
Chris@42 139 E T1q, TR, TF, T14;
Chris@42 140 T1k = T1i + T1j;
Chris@42 141 T1q = T1j - T1i;
Chris@42 142 TR = TB + TE;
Chris@42 143 TF = TB - TE;
Chris@42 144 T1r = T1p - T1q;
Chris@42 145 T1z = T1p + T1q;
Chris@42 146 Rm[0] = T1h + T1k;
Chris@42 147 TG = TA + TF;
Chris@42 148 T14 = TA - TF;
Chris@42 149 TS = TQ - TR;
Chris@42 150 T10 = TQ + TR;
Chris@42 151 T1A = Tk * T1z;
Chris@42 152 T19 = FNMS(KP707106781, T14, T13);
Chris@42 153 T15 = FMA(KP707106781, T14, T13);
Chris@42 154 T1C = Tk * T1B;
Chris@42 155 }
Chris@42 156 T17 = FMA(KP707106781, T10, TZ);
Chris@42 157 T11 = FNMS(KP707106781, T10, TZ);
Chris@42 158 Rp[WS(rs, 1)] = FNMS(To, T1B, T1A);
Chris@42 159 T16 = Tg * T15;
Chris@42 160 Rm[WS(rs, 1)] = FMA(To, T1z, T1C);
Chris@42 161 }
Chris@42 162 T12 = Tg * T11;
Chris@42 163 {
Chris@42 164 E T1l, T1a, T1c, T18;
Chris@42 165 Im[WS(rs, 1)] = FMA(Tj, T11, T16);
Chris@42 166 Ip[WS(rs, 1)] = FNMS(Tj, T15, T12);
Chris@42 167 T18 = Tl * T17;
Chris@42 168 T1l = T1h - T1k;
Chris@42 169 T1a = Tl * T19;
Chris@42 170 T1c = T7 - Te;
Chris@42 171 Ip[WS(rs, 3)] = FNMS(Tp, T19, T18);
Chris@42 172 {
Chris@42 173 E T1s, T1m, T1d, T1y, TV;
Chris@42 174 Im[WS(rs, 3)] = FMA(Tp, T17, T1a);
Chris@42 175 T1m = T1e * T1c;
Chris@42 176 T1d = T1b * T1c;
Chris@42 177 T1s = T1o * T1r;
Chris@42 178 Rm[WS(rs, 2)] = FMA(T1b, T1l, T1m);
Chris@42 179 Rp[WS(rs, 2)] = FNMS(T1e, T1l, T1d);
Chris@42 180 Rp[WS(rs, 3)] = FNMS(T1u, T1x, T1s);
Chris@42 181 T1y = T1o * T1x;
Chris@42 182 TV = FMA(KP707106781, TG, Tv);
Chris@42 183 TH = FNMS(KP707106781, TG, Tv);
Chris@42 184 TT = FNMS(KP707106781, TS, TP);
Chris@42 185 TX = FMA(KP707106781, TS, TP);
Chris@42 186 Rm[WS(rs, 3)] = FMA(T1u, T1r, T1y);
Chris@42 187 TW = Tf * TV;
Chris@42 188 TY = Ti * TV;
Chris@42 189 }
Chris@42 190 }
Chris@42 191 }
Chris@42 192 }
Chris@42 193 }
/* The last four stores use the values kept in the loop-body scope (Tf, Ti, TK, Tq, TH, TT, TX, TW, TY). */
Chris@42 194 Ip[0] = FNMS(Ti, TX, TW);
Chris@42 195 Im[0] = FMA(Tf, TX, TY);
Chris@42 196 TU = TK * TH;
Chris@42 197 TI = Tq * TH;
Chris@42 198 Im[WS(rs, 2)] = FMA(Tq, TT, TU);
Chris@42 199 Ip[WS(rs, 2)] = FNMS(TK, TT, TI);
Chris@42 200 }
Chris@42 201 }
Chris@42 202 }
Chris@42 203
/*
 * Twiddle instruction table for this codelet: three TW_CEXP entries followed
 * by a TW_NEXT entry. The codelet reads six W values per iteration (W[0..5]).
 * NOTE(review): presumably each TW_CEXP entry supplies one pair of those
 * values and TW_NEXT terminates the list -- confirm against the tw_instr
 * definition.
 */
Chris@42 204 static const tw_instr twinstr[] = {
Chris@42 205 {TW_CEXP, 1, 1},
Chris@42 206 {TW_CEXP, 1, 3},
Chris@42 207 {TW_CEXP, 1, 7},
Chris@42 208 {TW_NEXT, 1, 0}
Chris@42 209 };
Chris@42 210
/*
 * Descriptor handed to the planner together with the codelet. The trailing
 * {44, 20, 30, 0} matches the operation counts quoted in this variant's header
 * comment (44 additions, 20 multiplications, 30 fused multiply/add).
 * NOTE(review): the leading 8 is presumably the transform size (-n 8) -- confirm
 * against the hc2c_desc definition.
 */
Chris@42 211 static const hc2c_desc desc = { 8, "hc2cb2_8", twinstr, &GENUS, {44, 20, 30, 0} };
Chris@42 212
/*
 * Registration entry point: passes the hc2cb2_8 codelet, its descriptor and
 * the HC2C_VIA_RDFT flag to X(khc2c_register) for planner p.
 */
Chris@42 213 void X(codelet_hc2cb2_8) (planner *p) {
Chris@42 214 X(khc2c_register) (p, hc2cb2_8, &desc, HC2C_VIA_RDFT);
Chris@42 215 }
Chris@42 216 #else /* HAVE_FMA */
Chris@42 217
Chris@42 218 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hc2cb2_8 -include hc2cb.h */
Chris@42 219
Chris@42 220 /*
Chris@42 221 * This function contains 74 FP additions, 44 FP multiplications,
Chris@42 222 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
Chris@42 223 * 46 stack variables, 1 constants, and 32 memory accesses
Chris@42 224 */
Chris@42 225 #include "hc2cb.h"
Chris@42 226
/*
 * hc2cb2_8 (variant compiled when HAVE_FMA is not defined).
 *
 * For every m in [mb, me) this routine loads 16 values -- Rp[k], Ip[k],
 * Rm[k], Im[k] for k = 0..3, indexed with WS(rs, k) -- combines them with the
 * six twiddle values W[0..5], and stores 16 results back to those same
 * locations. All loads of an iteration happen before its first store.
 *
 * Per iteration Rp and Ip advance by +ms, Rm and Im move by -ms, and W
 * advances by 6. Before the first iteration W is offset by (mb - 1) * 6.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this file. FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm against the
 * project headers.
 *
 * This is generated code; do not hand-edit.
 */
Chris@42 227 static void hc2cb2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 228 {
/* The only numeric constant used by the butterfly (approximately 1/sqrt(2)). */
Chris@42 229 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 230 {
Chris@42 231 INT m;
/* The loop header also steps all four data pointers and the twiddle pointer. */
Chris@42 232 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 233 E Tf, Ti, Tg, Tj, Tl, Tp, TP, TR, TF, TG, TH, T15, TL, TT;
Chris@42 234 {
Chris@42 235 E Th, To, Tk, Tn;
/*
 * Load the six twiddle values and derive the extra multipliers from them:
 *   (Tl, Tp) = (Tf*Tg - Ti*Tj, Tf*Tj + Ti*Tg), i.e. (Tf + i*Ti) * (Tg + i*Tj)
 *   (TP, TR) = (Tf*Tg + Ti*Tj, Tf*Tj - Ti*Tg), i.e. (Tf - i*Ti) * (Tg + i*Tj)
 * TH, TL, T15 and TT are formed the same way from (Tf, Ti) or (TP, TR) and
 * (TF, TG) through the FMA/FNMS macros.
 */
Chris@42 236 Tf = W[0];
Chris@42 237 Ti = W[1];
Chris@42 238 Tg = W[2];
Chris@42 239 Tj = W[3];
Chris@42 240 Th = Tf * Tg;
Chris@42 241 To = Ti * Tg;
Chris@42 242 Tk = Ti * Tj;
Chris@42 243 Tn = Tf * Tj;
Chris@42 244 Tl = Th - Tk;
Chris@42 245 Tp = Tn + To;
Chris@42 246 TP = Th + Tk;
Chris@42 247 TR = Tn - To;
Chris@42 248 TF = W[4];
Chris@42 249 TG = W[5];
Chris@42 250 TH = FMA(Tf, TF, Ti * TG);
Chris@42 251 T15 = FNMS(TR, TF, TP * TG);
Chris@42 252 TL = FNMS(Ti, TF, Tf * TG);
Chris@42 253 TT = FMA(TP, TF, TR * TG);
Chris@42 254 }
Chris@42 255 {
Chris@42 256 E T7, T1f, T1i, Tw, TI, TW, T18, TM, Te, T19, T1a, TD, TJ, TZ, T12;
Chris@42 257 E TN, Tm, TE;
/* First half of the inputs: offsets 0 and 2 of Rp/Ip paired with offsets 3 and 1 of Rm/Im. */
Chris@42 258 {
Chris@42 259 E T3, TU, Ts, T17, T6, T16, Tv, TV;
Chris@42 260 {
Chris@42 261 E T1, T2, Tq, Tr;
Chris@42 262 T1 = Rp[0];
Chris@42 263 T2 = Rm[WS(rs, 3)];
Chris@42 264 T3 = T1 + T2;
Chris@42 265 TU = T1 - T2;
Chris@42 266 Tq = Ip[0];
Chris@42 267 Tr = Im[WS(rs, 3)];
Chris@42 268 Ts = Tq - Tr;
Chris@42 269 T17 = Tq + Tr;
Chris@42 270 }
Chris@42 271 {
Chris@42 272 E T4, T5, Tt, Tu;
Chris@42 273 T4 = Rp[WS(rs, 2)];
Chris@42 274 T5 = Rm[WS(rs, 1)];
Chris@42 275 T6 = T4 + T5;
Chris@42 276 T16 = T4 - T5;
Chris@42 277 Tt = Ip[WS(rs, 2)];
Chris@42 278 Tu = Im[WS(rs, 1)];
Chris@42 279 Tv = Tt - Tu;
Chris@42 280 TV = Tt + Tu;
Chris@42 281 }
Chris@42 282 T7 = T3 + T6;
Chris@42 283 T1f = TU + TV;
Chris@42 284 T1i = T17 - T16;
Chris@42 285 Tw = Ts + Tv;
Chris@42 286 TI = T3 - T6;
Chris@42 287 TW = TU - TV;
Chris@42 288 T18 = T16 + T17;
Chris@42 289 TM = Ts - Tv;
Chris@42 290 }
/* Second half of the inputs: offsets 1 and 3 of Rp/Ip paired with offsets 2 and 0 of Rm/Im. */
Chris@42 291 {
Chris@42 292 E Ta, TX, Tz, TY, Td, T10, TC, T11;
Chris@42 293 {
Chris@42 294 E T8, T9, Tx, Ty;
Chris@42 295 T8 = Rp[WS(rs, 1)];
Chris@42 296 T9 = Rm[WS(rs, 2)];
Chris@42 297 Ta = T8 + T9;
Chris@42 298 TX = T8 - T9;
Chris@42 299 Tx = Ip[WS(rs, 1)];
Chris@42 300 Ty = Im[WS(rs, 2)];
Chris@42 301 Tz = Tx - Ty;
Chris@42 302 TY = Tx + Ty;
Chris@42 303 }
Chris@42 304 {
Chris@42 305 E Tb, Tc, TA, TB;
Chris@42 306 Tb = Rm[0];
Chris@42 307 Tc = Rp[WS(rs, 3)];
Chris@42 308 Td = Tb + Tc;
Chris@42 309 T10 = Tb - Tc;
Chris@42 310 TA = Ip[WS(rs, 3)];
Chris@42 311 TB = Im[0];
Chris@42 312 TC = TA - TB;
Chris@42 313 T11 = TA + TB;
Chris@42 314 }
Chris@42 315 Te = Ta + Td;
Chris@42 316 T19 = TX + TY;
Chris@42 317 T1a = T10 + T11;
Chris@42 318 TD = Tz + TC;
Chris@42 319 TJ = TC - Tz;
Chris@42 320 TZ = TX - TY;
Chris@42 321 T12 = T10 - T11;
Chris@42 322 TN = Ta - Td;
Chris@42 323 }
/* Rp[0] and Rm[0] are plain sums: no twiddle value is applied to them. */
Chris@42 324 Rp[0] = T7 + Te;
Chris@42 325 Rm[0] = Tw + TD;
Chris@42 326 Tm = T7 - Te;
Chris@42 327 TE = Tw - TD;
Chris@42 328 Rp[WS(rs, 2)] = FNMS(Tp, TE, Tl * Tm);
Chris@42 329 Rm[WS(rs, 2)] = FMA(Tp, Tm, Tl * TE);
Chris@42 330 {
Chris@42 331 E TQ, TS, TK, TO;
Chris@42 332 TQ = TI + TJ;
Chris@42 333 TS = TN + TM;
Chris@42 334 Rp[WS(rs, 1)] = FNMS(TR, TS, TP * TQ);
Chris@42 335 Rm[WS(rs, 1)] = FMA(TP, TS, TR * TQ);
Chris@42 336 TK = TI - TJ;
Chris@42 337 TO = TM - TN;
Chris@42 338 Rp[WS(rs, 3)] = FNMS(TL, TO, TH * TK);
Chris@42 339 Rm[WS(rs, 3)] = FMA(TH, TO, TL * TK);
Chris@42 340 }
/* The Ip/Im outputs below each include a term scaled by KP707106781. */
Chris@42 341 {
Chris@42 342 E T1h, T1l, T1k, T1m, T1g, T1j;
Chris@42 343 T1g = KP707106781 * (T19 + T1a);
Chris@42 344 T1h = T1f - T1g;
Chris@42 345 T1l = T1f + T1g;
Chris@42 346 T1j = KP707106781 * (TZ - T12);
Chris@42 347 T1k = T1i + T1j;
Chris@42 348 T1m = T1i - T1j;
Chris@42 349 Ip[WS(rs, 1)] = FNMS(Tj, T1k, Tg * T1h);
Chris@42 350 Im[WS(rs, 1)] = FMA(Tg, T1k, Tj * T1h);
Chris@42 351 Ip[WS(rs, 3)] = FNMS(TG, T1m, TF * T1l);
Chris@42 352 Im[WS(rs, 3)] = FMA(TF, T1m, TG * T1l);
Chris@42 353 }
Chris@42 354 {
Chris@42 355 E T14, T1d, T1c, T1e, T13, T1b;
Chris@42 356 T13 = KP707106781 * (TZ + T12);
Chris@42 357 T14 = TW - T13;
Chris@42 358 T1d = TW + T13;
Chris@42 359 T1b = KP707106781 * (T19 - T1a);
Chris@42 360 T1c = T18 - T1b;
Chris@42 361 T1e = T18 + T1b;
Chris@42 362 Ip[WS(rs, 2)] = FNMS(T15, T1c, TT * T14);
Chris@42 363 Im[WS(rs, 2)] = FMA(T15, T14, TT * T1c);
Chris@42 364 Ip[0] = FNMS(Ti, T1e, Tf * T1d);
Chris@42 365 Im[0] = FMA(Ti, T1d, Tf * T1e);
Chris@42 366 }
Chris@42 367 }
Chris@42 368 }
Chris@42 369 }
Chris@42 370 }
Chris@42 371
/*
 * Twiddle instruction table for this codelet: three TW_CEXP entries followed
 * by a TW_NEXT entry. The codelet reads six W values per iteration (W[0..5]).
 * NOTE(review): presumably each TW_CEXP entry supplies one pair of those
 * values and TW_NEXT terminates the list -- confirm against the tw_instr
 * definition.
 */
Chris@42 372 static const tw_instr twinstr[] = {
Chris@42 373 {TW_CEXP, 1, 1},
Chris@42 374 {TW_CEXP, 1, 3},
Chris@42 375 {TW_CEXP, 1, 7},
Chris@42 376 {TW_NEXT, 1, 0}
Chris@42 377 };
Chris@42 378
/*
 * Descriptor handed to the planner together with the codelet. The trailing
 * {56, 26, 18, 0} matches the operation counts quoted in this variant's header
 * comment (56 additions, 26 multiplications, 18 fused multiply/add).
 * NOTE(review): the leading 8 is presumably the transform size (-n 8) -- confirm
 * against the hc2c_desc definition.
 */
Chris@42 379 static const hc2c_desc desc = { 8, "hc2cb2_8", twinstr, &GENUS, {56, 26, 18, 0} };
Chris@42 380
/*
 * Registration entry point: passes the hc2cb2_8 codelet, its descriptor and
 * the HC2C_VIA_RDFT flag to X(khc2c_register) for planner p.
 */
Chris@42 381 void X(codelet_hc2cb2_8) (planner *p) {
Chris@42 382 X(khc2c_register) (p, hc2cb2_8, &desc, HC2C_VIA_RDFT);
Chris@42 383 }
Chris@42 384 #endif /* HAVE_FMA */