annotate src/fftw-3.3.8/rdft/scalar/r2cf/hc2cf2_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:08 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cf2_8 -include rdft/scalar/hc2cf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 74 FP additions, 50 FP multiplications,
Chris@82 32 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
Chris@82 33 * 48 stack variables, 1 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cf.h"
Chris@82 36
/*
 * hc2cf2_8 (variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Generated kernel; per the generator command line above it was produced
 * with "-n 8 -dit -twiddle-log3 -precompute-twiddles".
 *
 * Rp, Ip, Rm, Im : data arrays, read and overwritten in place at element
 *                  offsets WS(rs, 0..3).
 * W              : twiddle table; six values W[0..5] are read per iteration.
 * rs             : stride handed to WS() to locate the four elements of
 *                  each array.
 * mb, me         : the loop runs for m = mb, mb+1, ..., me-1.
 * ms             : per-iteration step; Rp and Ip move forward by ms while
 *                  Rm and Im move backward by ms.
 *
 * FMA, FNMS and FMS are macros supplied by the included headers; their
 * definitions are not visible in this file.
 */
Chris@82 37 static void hc2cf2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* KP707106781 is sqrt(2)/2, the only numeric constant used by this kernel. */
Chris@82 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 40 {
Chris@82 41 INT m;
/* W is first positioned at entry (mb - 1), six values per entry, and then
 * advanced by 6 on every iteration alongside the data pointers. */
Chris@82 42 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 43 E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
/* Load the six stored twiddle values W[0..5] and combine them into the
 * derived factor pairs (T7/Tb, Tf/Ti, To/Ts, TC/TG) used on the inputs below. */
Chris@82 44 {
Chris@82 45 E T4, Tm, Tr, Ta, TB, TF;
Chris@82 46 T2 = W[0];
Chris@82 47 T3 = W[2];
Chris@82 48 T4 = T2 * T3;
Chris@82 49 Tl = W[4];
Chris@82 50 Tm = T2 * Tl;
Chris@82 51 Tn = W[5];
Chris@82 52 Tr = T2 * Tn;
Chris@82 53 T5 = W[1];
Chris@82 54 T6 = W[3];
Chris@82 55 Ta = T2 * T6;
Chris@82 56 Tf = FMA(T5, T6, T4);
Chris@82 57 T7 = FNMS(T5, T6, T4);
Chris@82 58 Ts = FNMS(T5, Tl, Tr);
Chris@82 59 Tb = FMA(T5, T3, Ta);
Chris@82 60 To = FMA(T5, Tn, Tm);
Chris@82 61 TB = Tf * Tl;
Chris@82 62 TF = Tf * Tn;
Chris@82 63 Ti = FNMS(T5, T3, Ta);
Chris@82 64 TC = FMA(Ti, Tn, TB);
Chris@82 65 TG = FNMS(Ti, Tl, TF);
Chris@82 66 }
Chris@82 67 {
Chris@82 68 E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
Chris@82 69 E TI, T11, T13, T15, T16;
/* Rp[0] / Rm[0] are used as loaded; every other input pair below is
 * combined with a twiddle factor pair before the butterflies. */
Chris@82 70 T1 = Rp[0];
Chris@82 71 T1s = Rm[0];
/* Rp/Rm at offset 2, combined with (T7, Tb). */
Chris@82 72 {
Chris@82 73 E T8, T9, Tc, T1q;
Chris@82 74 T8 = Rp[WS(rs, 2)];
Chris@82 75 T9 = T7 * T8;
Chris@82 76 Tc = Rm[WS(rs, 2)];
Chris@82 77 T1q = T7 * Tc;
Chris@82 78 Td = FMA(Tb, Tc, T9);
Chris@82 79 T1r = FNMS(Tb, T8, T1q);
Chris@82 80 }
/* Rp/Rm at offset 3, combined with (To, Ts). */
Chris@82 81 {
Chris@82 82 E Tp, Tq, Tt, TX;
Chris@82 83 Tp = Rp[WS(rs, 3)];
Chris@82 84 Tq = To * Tp;
Chris@82 85 Tt = Rm[WS(rs, 3)];
Chris@82 86 TX = To * Tt;
Chris@82 87 Tu = FMA(Ts, Tt, Tq);
Chris@82 88 TY = FNMS(Ts, Tp, TX);
Chris@82 89 }
/* Rp/Rm at offset 1, combined with (Tf, Ti). */
Chris@82 90 {
Chris@82 91 E Tg, Th, Tj, TV;
Chris@82 92 Tg = Rp[WS(rs, 1)];
Chris@82 93 Th = Tf * Tg;
Chris@82 94 Tj = Rm[WS(rs, 1)];
Chris@82 95 TV = Tf * Tj;
Chris@82 96 Tk = FMA(Ti, Tj, Th);
Chris@82 97 TW = FNMS(Ti, Tg, TV);
Chris@82 98 }
/* Ip/Im at offsets 3 and 1, combined with (Tl, Tn) and (T3, T6). */
Chris@82 99 {
Chris@82 100 E TK, TL, TM, T19, TO, TP, TQ, T1b;
Chris@82 101 TK = Ip[WS(rs, 3)];
Chris@82 102 TL = Tl * TK;
Chris@82 103 TM = Im[WS(rs, 3)];
Chris@82 104 T19 = Tl * TM;
Chris@82 105 TO = Ip[WS(rs, 1)];
Chris@82 106 TP = T3 * TO;
Chris@82 107 TQ = Im[WS(rs, 1)];
Chris@82 108 T1b = T3 * TQ;
Chris@82 109 TN = FMA(Tn, TM, TL);
Chris@82 110 TR = FMA(T6, TQ, TP);
Chris@82 111 T18 = TN - TR;
Chris@82 112 T1a = FNMS(Tn, TK, T19);
Chris@82 113 T1c = FNMS(T6, TO, T1b);
Chris@82 114 T1d = T1a - T1c;
Chris@82 115 }
/* Ip/Im at offsets 0 and 2, combined with (T2, T5) and (TC, TG). */
Chris@82 116 {
Chris@82 117 E Tx, Ty, Tz, T12, TD, TE, TH, T14;
Chris@82 118 Tx = Ip[0];
Chris@82 119 Ty = T2 * Tx;
Chris@82 120 Tz = Im[0];
Chris@82 121 T12 = T2 * Tz;
Chris@82 122 TD = Ip[WS(rs, 2)];
Chris@82 123 TE = TC * TD;
Chris@82 124 TH = Im[WS(rs, 2)];
Chris@82 125 T14 = TC * TH;
Chris@82 126 TA = FMA(T5, Tz, Ty);
Chris@82 127 TI = FMA(TG, TH, TE);
Chris@82 128 T11 = TA - TI;
Chris@82 129 T13 = FNMS(T5, Tx, T12);
Chris@82 130 T15 = FNMS(TG, TD, T14);
Chris@82 131 T16 = T13 - T15;
Chris@82 132 }
/* First group of eight outputs: the ones scaled through KP707106781
 * (Rm[2], Im[2], Rp[1], Ip[1], Rm[0], Im[0], Rp[3], Ip[3]). */
Chris@82 133 {
Chris@82 134 E T10, T1g, T1z, T1B, T1f, T1C, T1j, T1A;
Chris@82 135 {
Chris@82 136 E TU, TZ, T1x, T1y;
Chris@82 137 TU = T1 - Td;
Chris@82 138 TZ = TW - TY;
Chris@82 139 T10 = TU + TZ;
Chris@82 140 T1g = TU - TZ;
Chris@82 141 T1x = T1s - T1r;
Chris@82 142 T1y = Tk - Tu;
Chris@82 143 T1z = T1x - T1y;
Chris@82 144 T1B = T1y + T1x;
Chris@82 145 }
Chris@82 146 {
Chris@82 147 E T17, T1e, T1h, T1i;
Chris@82 148 T17 = T11 + T16;
Chris@82 149 T1e = T18 - T1d;
Chris@82 150 T1f = T17 + T1e;
Chris@82 151 T1C = T1e - T17;
Chris@82 152 T1h = T16 - T11;
Chris@82 153 T1i = T18 + T1d;
Chris@82 154 T1j = T1h - T1i;
Chris@82 155 T1A = T1h + T1i;
Chris@82 156 }
Chris@82 157 Rm[WS(rs, 2)] = FNMS(KP707106781, T1f, T10);
Chris@82 158 Im[WS(rs, 2)] = FMS(KP707106781, T1A, T1z);
Chris@82 159 Rp[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
Chris@82 160 Ip[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
Chris@82 161 Rm[0] = FNMS(KP707106781, T1j, T1g);
Chris@82 162 Im[0] = FMS(KP707106781, T1C, T1B);
Chris@82 163 Rp[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
Chris@82 164 Ip[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
Chris@82 165 }
/* Second group of eight outputs: plain sums and differences, no constant
 * (Rm[3], Im[3], Rp[0], Ip[0], Rm[1], Im[1], Rp[2], Ip[2]). */
Chris@82 166 {
Chris@82 167 E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
Chris@82 168 {
Chris@82 169 E Te, Tv, T1p, T1t;
Chris@82 170 Te = T1 + Td;
Chris@82 171 Tv = Tk + Tu;
Chris@82 172 Tw = Te + Tv;
Chris@82 173 T1k = Te - Tv;
Chris@82 174 T1p = TW + TY;
Chris@82 175 T1t = T1r + T1s;
Chris@82 176 T1u = T1p + T1t;
Chris@82 177 T1w = T1t - T1p;
Chris@82 178 }
Chris@82 179 {
Chris@82 180 E TJ, TS, T1l, T1m;
Chris@82 181 TJ = TA + TI;
Chris@82 182 TS = TN + TR;
Chris@82 183 TT = TJ + TS;
Chris@82 184 T1v = TS - TJ;
Chris@82 185 T1l = T13 + T15;
Chris@82 186 T1m = T1a + T1c;
Chris@82 187 T1n = T1l - T1m;
Chris@82 188 T1o = T1l + T1m;
Chris@82 189 }
Chris@82 190 Rm[WS(rs, 3)] = Tw - TT;
Chris@82 191 Im[WS(rs, 3)] = T1o - T1u;
Chris@82 192 Rp[0] = Tw + TT;
Chris@82 193 Ip[0] = T1o + T1u;
Chris@82 194 Rm[WS(rs, 1)] = T1k - T1n;
Chris@82 195 Im[WS(rs, 1)] = T1v - T1w;
Chris@82 196 Rp[WS(rs, 2)] = T1k + T1n;
Chris@82 197 Ip[WS(rs, 2)] = T1v + T1w;
Chris@82 198 }
Chris@82 199 }
Chris@82 200 }
Chris@82 201 }
Chris@82 202 }
Chris@82 203
/*
 * Twiddle instruction table referenced by the codelet descriptor: three
 * TW_CEXP entries (third field 1, 3 and 7) followed by a closing TW_NEXT
 * entry.
 * NOTE(review): presumably each TW_CEXP entry yields one pair of values,
 * which would match the six W values the kernel reads per iteration --
 * confirm against the tw_instr definition.
 */
Chris@82 204 static const tw_instr twinstr[] = {
Chris@82 205 {TW_CEXP, 1, 1},
Chris@82 206 {TW_CEXP, 1, 3},
Chris@82 207 {TW_CEXP, 1, 7},
Chris@82 208 {TW_NEXT, 1, 0}
Chris@82 209 };
Chris@82 210
/*
 * Descriptor passed at registration: leading field 8 (presumably the
 * transform size, cf. "-n 8" in the generator command line -- confirm
 * against hc2c_desc), the codelet name, its twiddle table, GENUS, and the
 * operation counts {44, 20, 30, 0}, which match the 44 additions,
 * 20 multiplications and 30 fused multiply/adds stated in the generator
 * comment for this variant.
 */
Chris@82 211 static const hc2c_desc desc = { 8, "hc2cf2_8", twinstr, &GENUS, {44, 20, 30, 0} };
Chris@82 212
/*
 * Registration entry point: hands the hc2cf2_8 kernel and its descriptor to
 * planner p through X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
Chris@82 213 void X(codelet_hc2cf2_8) (planner *p) {
Chris@82 214 X(khc2c_register) (p, hc2cf2_8, &desc, HC2C_VIA_RDFT);
Chris@82 215 }
Chris@82 216 #else
Chris@82 217
Chris@82 218 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cf2_8 -include rdft/scalar/hc2cf.h */
Chris@82 219
Chris@82 220 /*
Chris@82 221 * This function contains 74 FP additions, 44 FP multiplications,
Chris@82 222 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
Chris@82 223 * 42 stack variables, 1 constants, and 32 memory accesses
Chris@82 224 */
Chris@82 225 #include "rdft/scalar/hc2cf.h"
Chris@82 226
/*
 * hc2cf2_8 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Generated kernel; per the generator command line above it was produced
 * with "-n 8 -dit -twiddle-log3 -precompute-twiddles" (no "-fma").
 *
 * Rp, Ip, Rm, Im : data arrays, read and overwritten in place at element
 *                  offsets WS(rs, 0..3).
 * W              : twiddle table; six values W[0..5] are read per iteration.
 * rs             : stride handed to WS() to locate the four elements of
 *                  each array.
 * mb, me         : the loop runs for m = mb, mb+1, ..., me-1.
 * ms             : per-iteration step; Rp and Ip move forward by ms while
 *                  Rm and Im move backward by ms.
 *
 * FMA and FNMS are macros supplied by the included headers; their
 * definitions are not visible in this file.
 */
Chris@82 227 static void hc2cf2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 228 {
/* KP707106781 is sqrt(2)/2, the only numeric constant used by this kernel. */
Chris@82 229 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 230 {
Chris@82 231 INT m;
/* W is first positioned at entry (mb - 1), six values per entry, and then
 * advanced by 6 on every iteration alongside the data pointers. */
Chris@82 232 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 233 E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
/* Load W[0..5] and build the derived factors:
 *   T8 = T2*T3 - T5*T6,  Tc = T2*T6 + T5*T3,
 *   Tg = T2*T3 + T5*T6,  Ti = T2*T6 - T5*T3,
 * plus Tn/Tp (from T2, T5 with Tl, Tm) and Tx/Tz (from Tg, Ti with Tl, Tm). */
Chris@82 234 {
Chris@82 235 E T4, Tb, T7, Ta;
Chris@82 236 T2 = W[0];
Chris@82 237 T5 = W[1];
Chris@82 238 T3 = W[2];
Chris@82 239 T6 = W[3];
Chris@82 240 T4 = T2 * T3;
Chris@82 241 Tb = T5 * T3;
Chris@82 242 T7 = T5 * T6;
Chris@82 243 Ta = T2 * T6;
Chris@82 244 T8 = T4 - T7;
Chris@82 245 Tc = Ta + Tb;
Chris@82 246 Tg = T4 + T7;
Chris@82 247 Ti = Ta - Tb;
Chris@82 248 Tl = W[4];
Chris@82 249 Tm = W[5];
Chris@82 250 Tn = FMA(T2, Tl, T5 * Tm);
Chris@82 251 Tz = FNMS(Ti, Tl, Tg * Tm);
Chris@82 252 Tp = FNMS(T5, Tl, T2 * Tm);
Chris@82 253 Tx = FMA(Tg, Tl, Ti * Tm);
Chris@82 254 }
Chris@82 255 {
Chris@82 256 E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
Chris@82 257 E TT;
/* Rp/Rm at offsets 0 and 2: offset 0 is used as loaded, offset 2 is
 * combined with (T8, Tc); then sums and differences of the two. */
Chris@82 258 {
Chris@82 259 E T1, T1c, Te, T1b, T9, Td;
Chris@82 260 T1 = Rp[0];
Chris@82 261 T1c = Rm[0];
Chris@82 262 T9 = Rp[WS(rs, 2)];
Chris@82 263 Td = Rm[WS(rs, 2)];
Chris@82 264 Te = FMA(T8, T9, Tc * Td);
Chris@82 265 T1b = FNMS(Tc, T9, T8 * Td);
Chris@82 266 Tf = T1 + Te;
Chris@82 267 T1i = T1c - T1b;
Chris@82 268 TL = T1 - Te;
Chris@82 269 T1d = T1b + T1c;
Chris@82 270 }
/* Ip/Im at offsets 3 and 1, combined with (Tl, Tm) and (T3, T6). */
Chris@82 271 {
Chris@82 272 E TF, TW, TI, TX;
Chris@82 273 {
Chris@82 274 E TD, TE, TG, TH;
Chris@82 275 TD = Ip[WS(rs, 3)];
Chris@82 276 TE = Im[WS(rs, 3)];
Chris@82 277 TF = FMA(Tl, TD, Tm * TE);
Chris@82 278 TW = FNMS(Tm, TD, Tl * TE);
Chris@82 279 TG = Ip[WS(rs, 1)];
Chris@82 280 TH = Im[WS(rs, 1)];
Chris@82 281 TI = FMA(T3, TG, T6 * TH);
Chris@82 282 TX = FNMS(T6, TG, T3 * TH);
Chris@82 283 }
Chris@82 284 TJ = TF + TI;
Chris@82 285 T17 = TW + TX;
Chris@82 286 TV = TF - TI;
Chris@82 287 TY = TW - TX;
Chris@82 288 }
/* Rp/Rm at offsets 1 and 3, combined with (Tg, Ti) and (Tn, Tp). */
Chris@82 289 {
Chris@82 290 E Tk, TM, Tr, TN;
Chris@82 291 {
Chris@82 292 E Th, Tj, To, Tq;
Chris@82 293 Th = Rp[WS(rs, 1)];
Chris@82 294 Tj = Rm[WS(rs, 1)];
Chris@82 295 Tk = FMA(Tg, Th, Ti * Tj);
Chris@82 296 TM = FNMS(Ti, Th, Tg * Tj);
Chris@82 297 To = Rp[WS(rs, 3)];
Chris@82 298 Tq = Rm[WS(rs, 3)];
Chris@82 299 Tr = FMA(Tn, To, Tp * Tq);
Chris@82 300 TN = FNMS(Tp, To, Tn * Tq);
Chris@82 301 }
Chris@82 302 Ts = Tk + Tr;
Chris@82 303 T1j = Tk - Tr;
Chris@82 304 TO = TM - TN;
Chris@82 305 T1a = TM + TN;
Chris@82 306 }
/* Ip/Im at offsets 0 and 2, combined with (T2, T5) and (Tx, Tz). */
Chris@82 307 {
Chris@82 308 E Tw, TR, TB, TS;
Chris@82 309 {
Chris@82 310 E Tu, Tv, Ty, TA;
Chris@82 311 Tu = Ip[0];
Chris@82 312 Tv = Im[0];
Chris@82 313 Tw = FMA(T2, Tu, T5 * Tv);
Chris@82 314 TR = FNMS(T5, Tu, T2 * Tv);
Chris@82 315 Ty = Ip[WS(rs, 2)];
Chris@82 316 TA = Im[WS(rs, 2)];
Chris@82 317 TB = FMA(Tx, Ty, Tz * TA);
Chris@82 318 TS = FNMS(Tz, Ty, Tx * TA);
Chris@82 319 }
Chris@82 320 TC = Tw + TB;
Chris@82 321 T16 = TR + TS;
Chris@82 322 TQ = Tw - TB;
Chris@82 323 TT = TR - TS;
Chris@82 324 }
/* Output stage. All inputs were loaded above, so the in-place stores below
 * do not disturb values still needed. The first eight stores are plain sums
 * and differences; the last eight go through KP707106781. */
Chris@82 325 {
Chris@82 326 E Tt, TK, T1f, T1g;
Chris@82 327 Tt = Tf + Ts;
Chris@82 328 TK = TC + TJ;
Chris@82 329 Rm[WS(rs, 3)] = Tt - TK;
Chris@82 330 Rp[0] = Tt + TK;
Chris@82 331 {
Chris@82 332 E T19, T1e, T15, T18;
Chris@82 333 T19 = T16 + T17;
Chris@82 334 T1e = T1a + T1d;
Chris@82 335 Im[WS(rs, 3)] = T19 - T1e;
Chris@82 336 Ip[0] = T19 + T1e;
Chris@82 337 T15 = Tf - Ts;
Chris@82 338 T18 = T16 - T17;
Chris@82 339 Rm[WS(rs, 1)] = T15 - T18;
Chris@82 340 Rp[WS(rs, 2)] = T15 + T18;
Chris@82 341 }
Chris@82 342 T1f = TJ - TC;
Chris@82 343 T1g = T1d - T1a;
Chris@82 344 Im[WS(rs, 1)] = T1f - T1g;
Chris@82 345 Ip[WS(rs, 2)] = T1f + T1g;
/* KP707106781-scaled outputs Rm[0], Ip[1], Rp[3], Im[2]. */
Chris@82 346 {
Chris@82 347 E T11, T1k, T14, T1h, T12, T13;
Chris@82 348 T11 = TL - TO;
Chris@82 349 T1k = T1i - T1j;
Chris@82 350 T12 = TT - TQ;
Chris@82 351 T13 = TV + TY;
Chris@82 352 T14 = KP707106781 * (T12 - T13);
Chris@82 353 T1h = KP707106781 * (T12 + T13);
Chris@82 354 Rm[0] = T11 - T14;
Chris@82 355 Ip[WS(rs, 1)] = T1h + T1k;
Chris@82 356 Rp[WS(rs, 3)] = T11 + T14;
Chris@82 357 Im[WS(rs, 2)] = T1h - T1k;
Chris@82 358 }
/* KP707106781-scaled outputs Rm[2], Ip[3], Rp[1], Im[0]. */
Chris@82 359 {
Chris@82 360 E TP, T1m, T10, T1l, TU, TZ;
Chris@82 361 TP = TL + TO;
Chris@82 362 T1m = T1j + T1i;
Chris@82 363 TU = TQ + TT;
Chris@82 364 TZ = TV - TY;
Chris@82 365 T10 = KP707106781 * (TU + TZ);
Chris@82 366 T1l = KP707106781 * (TZ - TU);
Chris@82 367 Rm[WS(rs, 2)] = TP - T10;
Chris@82 368 Ip[WS(rs, 3)] = T1l + T1m;
Chris@82 369 Rp[WS(rs, 1)] = TP + T10;
Chris@82 370 Im[0] = T1l - T1m;
Chris@82 371 }
Chris@82 372 }
Chris@82 373 }
Chris@82 374 }
Chris@82 375 }
Chris@82 376 }
Chris@82 377
/*
 * Twiddle instruction table referenced by the codelet descriptor: three
 * TW_CEXP entries (third field 1, 3 and 7) followed by a closing TW_NEXT
 * entry.
 * NOTE(review): presumably each TW_CEXP entry yields one pair of values,
 * which would match the six W values the kernel reads per iteration --
 * confirm against the tw_instr definition.
 */
Chris@82 378 static const tw_instr twinstr[] = {
Chris@82 379 {TW_CEXP, 1, 1},
Chris@82 380 {TW_CEXP, 1, 3},
Chris@82 381 {TW_CEXP, 1, 7},
Chris@82 382 {TW_NEXT, 1, 0}
Chris@82 383 };
Chris@82 384
/*
 * Descriptor passed at registration: leading field 8 (presumably the
 * transform size, cf. "-n 8" in the generator command line -- confirm
 * against hc2c_desc), the codelet name, its twiddle table, GENUS, and the
 * operation counts {56, 26, 18, 0}, which match the 56 additions,
 * 26 multiplications and 18 fused multiply/adds stated in the generator
 * comment for this variant.
 */
Chris@82 385 static const hc2c_desc desc = { 8, "hc2cf2_8", twinstr, &GENUS, {56, 26, 18, 0} };
Chris@82 386
/*
 * Registration entry point: hands the hc2cf2_8 kernel and its descriptor to
 * planner p through X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
Chris@82 387 void X(codelet_hc2cf2_8) (planner *p) {
Chris@82 388 X(khc2c_register) (p, hc2cf2_8, &desc, HC2C_VIA_RDFT);
Chris@82 389 }
Chris@82 390 #endif