annotate src/fftw-3.3.8/rdft/scalar/r2cb/hb2_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:37 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hb2_8 -include rdft/scalar/hb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 74 FP additions, 50 FP multiplications,
Chris@82 32 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
Chris@82 33 * 47 stack variables, 1 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hb.h"
Chris@82 36
/*
 * hb2_8 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop body loads eight values from cr and
 * eight from ci (element 0 directly, elements 1..7 through WS(rs, k)),
 * loads six consecutive values from W, and stores sixteen results back to
 * the same cr/ci positions.  All loads of an iteration are finished before
 * its first store.
 *
 *   cr, ci  data arrays updated in place; after each iteration cr moves
 *           forward by ms and ci moves backward by ms
 *   W       multiplier table; starts at W + (mb - 1) * 6 and advances by 6
 *           per iteration
 *   rs      stride handed to WS() to locate element k
 *   mb, me  half-open iteration range
 *   ms      per-iteration pointer step for cr / ci
 *
 * NOTE(review): assuming FMA(a, b, c) is a*b + c and FNMS(a, b, c) is
 * c - a*b (both macros are defined outside this file -- confirm), each
 * cr/ci store pair below has the shape cr = a*x - b*y, ci = b*x + a*y for an
 * intermediate pair (x, y) and a multiplier pair (a, b).
 */
Chris@82 37 static void hb2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Only numeric constant used by the kernel: 0.7071..., i.e. 1/sqrt(2). */
Chris@82 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 40 {
Chris@82 41 INT m;
/* MAKE_VOLATILE_STRIDE is a project macro evaluated once per iteration; its effect is not visible in this file. */
Chris@82 42 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@82 43 E Tf, Tg, Tl, Tp, Ti, Tj, Tk, T1b, T1u, T1e, T1o, To, Tq, TK;
/*
 * Load W[0..5] and build the derived multiplier pairs.  The loaded pairs
 * (Tf, Ti) = W[0..1], (Tg, Tj) = W[2..3] and (Tl, Tp) = W[4..5] are later
 * applied directly to outputs 1, 3 and 7; the derived pairs (Tk, To),
 * (T1b, T1e), (Tq, TK) and (T1o, T1u) are applied to outputs 2, 4, 5 and 6.
 */
Chris@82 44 {
Chris@82 45 E Th, T1n, T1t, Tn, Tm, TJ;
Chris@82 46 Tf = W[0];
Chris@82 47 Tg = W[2];
Chris@82 48 Th = Tf * Tg;
Chris@82 49 Tl = W[4];
Chris@82 50 T1n = Tf * Tl;
Chris@82 51 Tp = W[5];
Chris@82 52 T1t = Tf * Tp;
Chris@82 53 Ti = W[1];
Chris@82 54 Tj = W[3];
Chris@82 55 Tn = Tf * Tj;
Chris@82 56 Tk = FMA(Ti, Tj, Th);
Chris@82 57 T1b = FNMS(Ti, Tj, Th);
Chris@82 58 T1u = FNMS(Ti, Tl, T1t);
Chris@82 59 T1e = FMA(Ti, Tg, Tn);
Chris@82 60 T1o = FMA(Ti, Tp, T1n);
Chris@82 61 Tm = Tk * Tl;
Chris@82 62 TJ = Tk * Tp;
Chris@82 63 To = FNMS(Ti, Tg, Tn);
Chris@82 64 Tq = FMA(To, Tp, Tm);
Chris@82 65 TK = FNMS(To, Tl, TJ);
Chris@82 66 }
Chris@82 67 {
Chris@82 68 E T7, T1p, T1v, Tv, TP, T13, T1h, TZ, Te, T1k, T1w, T1q, TQ, TR, T10;
Chris@82 69 E TG, T14;
/* First group of loads: even-indexed cr elements (0, 2, 4, 6) and odd-indexed ci elements (1, 3, 5, 7), combined into sums and differences. */
Chris@82 70 {
Chris@82 71 E T3, Tr, TO, T1f, T6, TL, Tu, T1g;
Chris@82 72 {
Chris@82 73 E T1, T2, TM, TN;
Chris@82 74 T1 = cr[0];
Chris@82 75 T2 = ci[WS(rs, 3)];
Chris@82 76 T3 = T1 + T2;
Chris@82 77 Tr = T1 - T2;
Chris@82 78 TM = ci[WS(rs, 7)];
Chris@82 79 TN = cr[WS(rs, 4)];
Chris@82 80 TO = TM + TN;
Chris@82 81 T1f = TM - TN;
Chris@82 82 }
Chris@82 83 {
Chris@82 84 E T4, T5, Ts, Tt;
Chris@82 85 T4 = cr[WS(rs, 2)];
Chris@82 86 T5 = ci[WS(rs, 1)];
Chris@82 87 T6 = T4 + T5;
Chris@82 88 TL = T4 - T5;
Chris@82 89 Ts = ci[WS(rs, 5)];
Chris@82 90 Tt = cr[WS(rs, 6)];
Chris@82 91 Tu = Ts + Tt;
Chris@82 92 T1g = Ts - Tt;
Chris@82 93 }
Chris@82 94 T7 = T3 + T6;
Chris@82 95 T1p = T3 - T6;
Chris@82 96 T1v = T1f - T1g;
Chris@82 97 Tv = Tr - Tu;
Chris@82 98 TP = TL + TO;
Chris@82 99 T13 = TO - TL;
Chris@82 100 T1h = T1f + T1g;
Chris@82 101 TZ = Tr + Tu;
Chris@82 102 }
/* Second group of loads: odd-indexed cr elements (1, 3, 5, 7) and even-indexed ci elements (0, 2, 4, 6). */
Chris@82 103 {
Chris@82 104 E Ta, Tw, TE, T1j, Td, TB, Tz, T1i, TA, TF;
Chris@82 105 {
Chris@82 106 E T8, T9, TC, TD;
Chris@82 107 T8 = cr[WS(rs, 1)];
Chris@82 108 T9 = ci[WS(rs, 2)];
Chris@82 109 Ta = T8 + T9;
Chris@82 110 Tw = T8 - T9;
Chris@82 111 TC = ci[WS(rs, 4)];
Chris@82 112 TD = cr[WS(rs, 7)];
Chris@82 113 TE = TC + TD;
Chris@82 114 T1j = TC - TD;
Chris@82 115 }
Chris@82 116 {
Chris@82 117 E Tb, Tc, Tx, Ty;
Chris@82 118 Tb = ci[0];
Chris@82 119 Tc = cr[WS(rs, 3)];
Chris@82 120 Td = Tb + Tc;
Chris@82 121 TB = Tb - Tc;
Chris@82 122 Tx = ci[WS(rs, 6)];
Chris@82 123 Ty = cr[WS(rs, 5)];
Chris@82 124 Tz = Tx + Ty;
Chris@82 125 T1i = Tx - Ty;
Chris@82 126 }
Chris@82 127 Te = Ta + Td;
Chris@82 128 T1k = T1i + T1j;
Chris@82 129 T1w = Ta - Td;
Chris@82 130 T1q = T1j - T1i;
Chris@82 131 TQ = Tw + Tz;
Chris@82 132 TR = TB + TE;
Chris@82 133 T10 = TQ + TR;
Chris@82 134 TA = Tw - Tz;
Chris@82 135 TF = TB - TE;
Chris@82 136 TG = TA + TF;
Chris@82 137 T14 = TA - TF;
Chris@82 138 }
/* Output 0: plain sums, no W multiplier applied.  From here on the loop body only stores. */
Chris@82 139 cr[0] = T7 + Te;
Chris@82 140 ci[0] = T1h + T1k;
/* Output 3: multiplier pair (Tg, Tj) = W[2..3]; intermediates use the 1/sqrt(2) constant. */
Chris@82 141 {
Chris@82 142 E T11, T12, T15, T16;
Chris@82 143 T11 = FNMS(KP707106781, T10, TZ);
Chris@82 144 T12 = Tg * T11;
Chris@82 145 T15 = FMA(KP707106781, T14, T13);
Chris@82 146 T16 = Tg * T15;
Chris@82 147 cr[WS(rs, 3)] = FNMS(Tj, T15, T12);
Chris@82 148 ci[WS(rs, 3)] = FMA(Tj, T11, T16);
Chris@82 149 }
/* Output 2: derived multiplier pair (Tk, To). */
Chris@82 150 {
Chris@82 151 E T1z, T1A, T1B, T1C;
Chris@82 152 T1z = T1p + T1q;
Chris@82 153 T1A = Tk * T1z;
Chris@82 154 T1B = T1w + T1v;
Chris@82 155 T1C = Tk * T1B;
Chris@82 156 cr[WS(rs, 2)] = FNMS(To, T1B, T1A);
Chris@82 157 ci[WS(rs, 2)] = FMA(To, T1z, T1C);
Chris@82 158 }
/* Output 7: multiplier pair (Tl, Tp) = W[4..5]; same intermediates as output 3 with the constant term's sign flipped. */
Chris@82 159 {
Chris@82 160 E T17, T18, T19, T1a;
Chris@82 161 T17 = FMA(KP707106781, T10, TZ);
Chris@82 162 T18 = Tl * T17;
Chris@82 163 T19 = FNMS(KP707106781, T14, T13);
Chris@82 164 T1a = Tl * T19;
Chris@82 165 cr[WS(rs, 7)] = FNMS(Tp, T19, T18);
Chris@82 166 ci[WS(rs, 7)] = FMA(Tp, T17, T1a);
Chris@82 167 }
/* Output 4: derived multiplier pair (T1b, T1e), applied to the differences matching the sums stored at output 0. */
Chris@82 168 {
Chris@82 169 E T1l, T1d, T1m, T1c;
Chris@82 170 T1l = T1h - T1k;
Chris@82 171 T1c = T7 - Te;
Chris@82 172 T1d = T1b * T1c;
Chris@82 173 T1m = T1e * T1c;
Chris@82 174 cr[WS(rs, 4)] = FNMS(T1e, T1l, T1d);
Chris@82 175 ci[WS(rs, 4)] = FMA(T1b, T1l, T1m);
Chris@82 176 }
/* Output 6: derived multiplier pair (T1o, T1u). */
Chris@82 177 {
Chris@82 178 E T1r, T1s, T1x, T1y;
Chris@82 179 T1r = T1p - T1q;
Chris@82 180 T1s = T1o * T1r;
Chris@82 181 T1x = T1v - T1w;
Chris@82 182 T1y = T1o * T1x;
Chris@82 183 cr[WS(rs, 6)] = FNMS(T1u, T1x, T1s);
Chris@82 184 ci[WS(rs, 6)] = FMA(T1u, T1r, T1y);
Chris@82 185 }
/* Outputs 5 and 1: derived pair (Tq, TK) for output 5, loaded pair (Tf, Ti) = W[0..1] for output 1. */
Chris@82 186 {
Chris@82 187 E TT, TX, TW, TY, TI, TU, TS, TV, TH;
Chris@82 188 TS = TQ - TR;
Chris@82 189 TT = FNMS(KP707106781, TS, TP);
Chris@82 190 TX = FMA(KP707106781, TS, TP);
Chris@82 191 TV = FMA(KP707106781, TG, Tv);
Chris@82 192 TW = Tf * TV;
Chris@82 193 TY = Ti * TV;
Chris@82 194 TH = FNMS(KP707106781, TG, Tv);
Chris@82 195 TI = Tq * TH;
Chris@82 196 TU = TK * TH;
Chris@82 197 cr[WS(rs, 5)] = FNMS(TK, TT, TI);
Chris@82 198 ci[WS(rs, 5)] = FMA(Tq, TT, TU);
Chris@82 199 cr[WS(rs, 1)] = FNMS(Ti, TX, TW);
Chris@82 200 ci[WS(rs, 1)] = FMA(Tf, TX, TY);
Chris@82 201 }
Chris@82 202 }
Chris@82 203 }
Chris@82 204 }
Chris@82 205 }
Chris@82 206
/*
 * Twiddle instruction list referenced by desc: three TW_CEXP entries whose
 * last fields are 1, 3 and 7, closed by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry supplies one pair of W values
 * (the kernel reads six W values per iteration and applies W[0..1], W[2..3]
 * and W[4..5] directly to outputs 1, 3 and 7) -- confirm against the
 * tw_instr definition.
 */
Chris@82 207 static const tw_instr twinstr[] = {
Chris@82 208 {TW_CEXP, 1, 1},
Chris@82 209 {TW_CEXP, 1, 3},
Chris@82 210 {TW_CEXP, 1, 7},
Chris@82 211 {TW_NEXT, 1, 0}
Chris@82 212 };
Chris@82 213
/*
 * Descriptor passed to the registration call below: a leading 8 (the same
 * value as "-n 8" on the generator command line), the kernel name string,
 * the twiddle instruction list, &GENUS, and a four-entry record whose first
 * three entries (44, 20, 30) equal the addition / multiplication / fused
 * multiply-add counts quoted in this variant's generated header comment.
 * NOTE(review): the field meanings come from hc2hc_desc, which is declared
 * outside this file -- confirm.
 */
Chris@82 214 static const hc2hc_desc desc = { 8, "hb2_8", twinstr, &GENUS, {44, 20, 30, 0} };
Chris@82 215
/*
 * Externally visible entry point of this file: forwards planner p, the
 * hb2_8 kernel and its descriptor to X(khc2hc_register).  X() is a project
 * macro applied to both identifiers; it is not defined in this file.
 */
Chris@82 216 void X(codelet_hb2_8) (planner *p) {
Chris@82 217 X(khc2hc_register) (p, hb2_8, &desc);
Chris@82 218 }
Chris@82 219 #else
Chris@82 220
Chris@82 221 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hb2_8 -include rdft/scalar/hb.h */
Chris@82 222
Chris@82 223 /*
Chris@82 224 * This function contains 74 FP additions, 44 FP multiplications,
Chris@82 225 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
Chris@82 226 * 46 stack variables, 1 constants, and 32 memory accesses
Chris@82 227 */
Chris@82 228 #include "rdft/scalar/hb.h"
Chris@82 229
/*
 * hb2_8 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop body loads eight values from cr and
 * eight from ci (element 0 directly, elements 1..7 through WS(rs, k)),
 * loads six consecutive values from W, and stores sixteen results back to
 * the same cr/ci positions.  All loads of an iteration are finished before
 * its first store.
 *
 *   cr, ci  data arrays updated in place; after each iteration cr moves
 *           forward by ms and ci moves backward by ms
 *   W       multiplier table; starts at W + (mb - 1) * 6 and advances by 6
 *           per iteration
 *   rs      stride handed to WS() to locate element k
 *   mb, me  half-open iteration range
 *   ms      per-iteration pointer step for cr / ci
 *
 * NOTE(review): assuming FMA(a, b, c) is a*b + c and FNMS(a, b, c) is
 * c - a*b (both macros are defined outside this file -- confirm), each
 * cr/ci store pair below has the shape cr = a*x - b*y, ci = b*x + a*y for an
 * intermediate pair (x, y) and a multiplier pair (a, b).
 */
Chris@82 230 static void hb2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 231 {
/* Only numeric constant used by the kernel: 0.7071..., i.e. 1/sqrt(2). */
Chris@82 232 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 233 {
Chris@82 234 INT m;
/* MAKE_VOLATILE_STRIDE is a project macro evaluated once per iteration; its effect is not visible in this file. */
Chris@82 235 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@82 236 E Tf, Ti, Tg, Tj, Tl, Tp, TP, TR, TF, TG, TH, T15, TL, TT;
/*
 * Load W[0..5] and build the derived multiplier pairs.  The loaded pairs
 * (Tf, Ti) = W[0..1], (Tg, Tj) = W[2..3] and (TF, TG) = W[4..5] are later
 * applied directly to outputs 1, 3 and 7; the derived pairs (TP, TR),
 * (Tl, Tp), (TT, T15) and (TH, TL) are applied to outputs 2, 4, 5 and 6.
 */
Chris@82 237 {
Chris@82 238 E Th, To, Tk, Tn;
Chris@82 239 Tf = W[0];
Chris@82 240 Ti = W[1];
Chris@82 241 Tg = W[2];
Chris@82 242 Tj = W[3];
Chris@82 243 Th = Tf * Tg;
Chris@82 244 To = Ti * Tg;
Chris@82 245 Tk = Ti * Tj;
Chris@82 246 Tn = Tf * Tj;
Chris@82 247 Tl = Th - Tk;
Chris@82 248 Tp = Tn + To;
Chris@82 249 TP = Th + Tk;
Chris@82 250 TR = Tn - To;
Chris@82 251 TF = W[4];
Chris@82 252 TG = W[5];
Chris@82 253 TH = FMA(Tf, TF, Ti * TG);
Chris@82 254 T15 = FNMS(TR, TF, TP * TG);
Chris@82 255 TL = FNMS(Ti, TF, Tf * TG);
Chris@82 256 TT = FMA(TP, TF, TR * TG);
Chris@82 257 }
Chris@82 258 {
Chris@82 259 E T7, T1f, T1i, Tw, TI, TW, T18, TM, Te, T19, T1a, TD, TJ, TZ, T12;
Chris@82 260 E TN, Tm, TE;
/* First group of loads: even-indexed cr elements (0, 2, 4, 6) and odd-indexed ci elements (1, 3, 5, 7), combined into sums and differences. */
Chris@82 261 {
Chris@82 262 E T3, TU, Tv, TV, T6, T16, Ts, T17;
Chris@82 263 {
Chris@82 264 E T1, T2, Tt, Tu;
Chris@82 265 T1 = cr[0];
Chris@82 266 T2 = ci[WS(rs, 3)];
Chris@82 267 T3 = T1 + T2;
Chris@82 268 TU = T1 - T2;
Chris@82 269 Tt = ci[WS(rs, 5)];
Chris@82 270 Tu = cr[WS(rs, 6)];
Chris@82 271 Tv = Tt - Tu;
Chris@82 272 TV = Tt + Tu;
Chris@82 273 }
Chris@82 274 {
Chris@82 275 E T4, T5, Tq, Tr;
Chris@82 276 T4 = cr[WS(rs, 2)];
Chris@82 277 T5 = ci[WS(rs, 1)];
Chris@82 278 T6 = T4 + T5;
Chris@82 279 T16 = T4 - T5;
Chris@82 280 Tq = ci[WS(rs, 7)];
Chris@82 281 Tr = cr[WS(rs, 4)];
Chris@82 282 Ts = Tq - Tr;
Chris@82 283 T17 = Tq + Tr;
Chris@82 284 }
Chris@82 285 T7 = T3 + T6;
Chris@82 286 T1f = TU + TV;
Chris@82 287 T1i = T17 - T16;
Chris@82 288 Tw = Ts + Tv;
Chris@82 289 TI = T3 - T6;
Chris@82 290 TW = TU - TV;
Chris@82 291 T18 = T16 + T17;
Chris@82 292 TM = Ts - Tv;
Chris@82 293 }
/* Second group of loads: odd-indexed cr elements (1, 3, 5, 7) and even-indexed ci elements (0, 2, 4, 6). */
Chris@82 294 {
Chris@82 295 E Ta, TX, TC, T11, Td, T10, Tz, TY;
Chris@82 296 {
Chris@82 297 E T8, T9, TA, TB;
Chris@82 298 T8 = cr[WS(rs, 1)];
Chris@82 299 T9 = ci[WS(rs, 2)];
Chris@82 300 Ta = T8 + T9;
Chris@82 301 TX = T8 - T9;
Chris@82 302 TA = ci[WS(rs, 4)];
Chris@82 303 TB = cr[WS(rs, 7)];
Chris@82 304 TC = TA - TB;
Chris@82 305 T11 = TA + TB;
Chris@82 306 }
Chris@82 307 {
Chris@82 308 E Tb, Tc, Tx, Ty;
Chris@82 309 Tb = ci[0];
Chris@82 310 Tc = cr[WS(rs, 3)];
Chris@82 311 Td = Tb + Tc;
Chris@82 312 T10 = Tb - Tc;
Chris@82 313 Tx = ci[WS(rs, 6)];
Chris@82 314 Ty = cr[WS(rs, 5)];
Chris@82 315 Tz = Tx - Ty;
Chris@82 316 TY = Tx + Ty;
Chris@82 317 }
Chris@82 318 Te = Ta + Td;
Chris@82 319 T19 = TX + TY;
Chris@82 320 T1a = T10 + T11;
Chris@82 321 TD = Tz + TC;
Chris@82 322 TJ = TC - Tz;
Chris@82 323 TZ = TX - TY;
Chris@82 324 T12 = T10 - T11;
Chris@82 325 TN = Ta - Td;
Chris@82 326 }
/* Output 0: plain sums, no W multiplier applied.  From here on the loop body only stores. */
Chris@82 327 cr[0] = T7 + Te;
Chris@82 328 ci[0] = Tw + TD;
/* Output 4: derived multiplier pair (Tl, Tp), applied to the differences matching the sums stored at output 0. */
Chris@82 329 Tm = T7 - Te;
Chris@82 330 TE = Tw - TD;
Chris@82 331 cr[WS(rs, 4)] = FNMS(Tp, TE, Tl * Tm);
Chris@82 332 ci[WS(rs, 4)] = FMA(Tp, Tm, Tl * TE);
/* Outputs 2 and 6: derived multiplier pairs (TP, TR) and (TH, TL). */
Chris@82 333 {
Chris@82 334 E TQ, TS, TK, TO;
Chris@82 335 TQ = TI + TJ;
Chris@82 336 TS = TN + TM;
Chris@82 337 cr[WS(rs, 2)] = FNMS(TR, TS, TP * TQ);
Chris@82 338 ci[WS(rs, 2)] = FMA(TP, TS, TR * TQ);
Chris@82 339 TK = TI - TJ;
Chris@82 340 TO = TM - TN;
Chris@82 341 cr[WS(rs, 6)] = FNMS(TL, TO, TH * TK);
Chris@82 342 ci[WS(rs, 6)] = FMA(TH, TO, TL * TK);
Chris@82 343 }
/* Outputs 3 and 7: loaded pairs (Tg, Tj) = W[2..3] and (TF, TG) = W[4..5]; intermediates are scaled by 1/sqrt(2). */
Chris@82 344 {
Chris@82 345 E T1h, T1l, T1k, T1m, T1g, T1j;
Chris@82 346 T1g = KP707106781 * (T19 + T1a);
Chris@82 347 T1h = T1f - T1g;
Chris@82 348 T1l = T1f + T1g;
Chris@82 349 T1j = KP707106781 * (TZ - T12);
Chris@82 350 T1k = T1i + T1j;
Chris@82 351 T1m = T1i - T1j;
Chris@82 352 cr[WS(rs, 3)] = FNMS(Tj, T1k, Tg * T1h);
Chris@82 353 ci[WS(rs, 3)] = FMA(Tg, T1k, Tj * T1h);
Chris@82 354 cr[WS(rs, 7)] = FNMS(TG, T1m, TF * T1l);
Chris@82 355 ci[WS(rs, 7)] = FMA(TF, T1m, TG * T1l);
Chris@82 356 }
/* Outputs 5 and 1: derived pair (TT, T15) for output 5, loaded pair (Tf, Ti) = W[0..1] for output 1. */
Chris@82 357 {
Chris@82 358 E T14, T1d, T1c, T1e, T13, T1b;
Chris@82 359 T13 = KP707106781 * (TZ + T12);
Chris@82 360 T14 = TW - T13;
Chris@82 361 T1d = TW + T13;
Chris@82 362 T1b = KP707106781 * (T19 - T1a);
Chris@82 363 T1c = T18 - T1b;
Chris@82 364 T1e = T18 + T1b;
Chris@82 365 cr[WS(rs, 5)] = FNMS(T15, T1c, TT * T14);
Chris@82 366 ci[WS(rs, 5)] = FMA(T15, T14, TT * T1c);
Chris@82 367 cr[WS(rs, 1)] = FNMS(Ti, T1e, Tf * T1d);
Chris@82 368 ci[WS(rs, 1)] = FMA(Ti, T1d, Tf * T1e);
Chris@82 369 }
Chris@82 370 }
Chris@82 371 }
Chris@82 372 }
Chris@82 373 }
Chris@82 374
/*
 * Twiddle instruction list referenced by desc: three TW_CEXP entries whose
 * last fields are 1, 3 and 7, closed by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry supplies one pair of W values
 * (the kernel reads six W values per iteration and applies W[0..1], W[2..3]
 * and W[4..5] directly to outputs 1, 3 and 7) -- confirm against the
 * tw_instr definition.
 */
Chris@82 375 static const tw_instr twinstr[] = {
Chris@82 376 {TW_CEXP, 1, 1},
Chris@82 377 {TW_CEXP, 1, 3},
Chris@82 378 {TW_CEXP, 1, 7},
Chris@82 379 {TW_NEXT, 1, 0}
Chris@82 380 };
Chris@82 381
/*
 * Descriptor passed to the registration call below: a leading 8 (the same
 * value as "-n 8" on the generator command line), the kernel name string,
 * the twiddle instruction list, &GENUS, and a four-entry record whose first
 * three entries (56, 26, 18) equal the addition / multiplication / fused
 * multiply-add counts quoted in this variant's generated header comment.
 * NOTE(review): the field meanings come from hc2hc_desc, which is declared
 * outside this file -- confirm.
 */
Chris@82 382 static const hc2hc_desc desc = { 8, "hb2_8", twinstr, &GENUS, {56, 26, 18, 0} };
Chris@82 383
/*
 * Externally visible entry point of this file: forwards planner p, the
 * hb2_8 kernel and its descriptor to X(khc2hc_register).  X() is a project
 * macro applied to both identifiers; it is not defined in this file.
 */
Chris@82 384 void X(codelet_hb2_8) (planner *p) {
Chris@82 385 X(khc2hc_register) (p, hb2_8, &desc);
Chris@82 386 }
Chris@82 387 #endif