annotate src/fftw-3.3.5/rdft/scalar/r2cb/hb2_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes).
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:50:10 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hb2_8 -include hb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 74 FP additions, 50 FP multiplications,
Chris@42 32 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
Chris@42 33 * 77 stack variables, 1 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "hb.h"
Chris@42 36
Chris@42 37 static void hb2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms) /* Generated size-8 codelet, HAVE_FMA variant: for each m in [mb, me) it loads 8 values from cr and 8 from ci, combines them with twiddle factors taken from W, and stores 16 results back into the same cr/ci slots. */
Chris@42 38 { /* cr, ci: data arrays accessed at offsets 0 and WS(rs, 1..7); W: twiddle table, 6 values consumed per iteration; ms: per-iteration step (cr moves by +ms, ci by -ms). */
Chris@42 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* Numerically 1/sqrt(2); the only constant this codelet declares. */
Chris@42 40 {
Chris@42 41 INT m;
Chris@42 42 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) { /* W is first advanced by (mb - 1) * 6 entries, so m == 1 presumably corresponds to the start of the table -- confirm against the caller. */
Chris@42 43 E Tf, Tg, Tl, Tp, Ti, Tj, T1o, T1u, Tk, T1b, To, T1e, TK, Tq, T13;
Chris@42 44 E TP, T1p, T7, T1h, T1v, TZ, Tv, Tw, Ta, Tx, T1j, TE, TB, Td, Ty;
Chris@42 45 E Th, T1n, T1t;
Chris@42 46 Tf = W[0]; /* W[0..5] are used below as the pairs (Tf, Ti), (Tg, Tj) and (Tl, Tp), applied to the outputs at indices 1, 3 and 7 respectively. */
Chris@42 47 Tg = W[2];
Chris@42 48 Tl = W[4];
Chris@42 49 Tp = W[5];
Chris@42 50 Ti = W[1];
Chris@42 51 Th = Tf * Tg;
Chris@42 52 T1n = Tf * Tl;
Chris@42 53 T1t = Tf * Tp;
Chris@42 54 Tj = W[3];
Chris@42 55 {
Chris@42 56 E Tr, T3, Ts, T1f, TO, TL, T6, Tt;
Chris@42 57 {
Chris@42 58 E TM, TN, T4, T5;
Chris@42 59 {
Chris@42 60 E T1, Tn, T2, TJ, Tm;
Chris@42 61 T1 = cr[0];
Chris@42 62 T1o = FMA(Ti, Tp, T1n); /* (T1o, T1u): derived pair applied to the outputs at index 6. FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) presumably c - a*b; both macros are defined outside this file. */
Chris@42 63 T1u = FNMS(Ti, Tl, T1t);
Chris@42 64 Tk = FMA(Ti, Tj, Th); /* (Tk, To): derived pair applied to the outputs at index 2. */
Chris@42 65 T1b = FNMS(Ti, Tj, Th); /* (T1b, T1e): derived pair applied to the outputs at index 4. */
Chris@42 66 Tn = Tf * Tj;
Chris@42 67 T2 = ci[WS(rs, 3)];
Chris@42 68 TM = ci[WS(rs, 7)];
Chris@42 69 TJ = Tk * Tp;
Chris@42 70 Tm = Tk * Tl;
Chris@42 71 To = FNMS(Ti, Tg, Tn);
Chris@42 72 T1e = FMA(Ti, Tg, Tn);
Chris@42 73 Tr = T1 - T2;
Chris@42 74 T3 = T1 + T2;
Chris@42 75 TK = FNMS(To, Tl, TJ); /* (Tq, TK): derived pair applied to the outputs at index 5. */
Chris@42 76 Tq = FMA(To, Tp, Tm);
Chris@42 77 TN = cr[WS(rs, 4)];
Chris@42 78 }
Chris@42 79 T4 = cr[WS(rs, 2)];
Chris@42 80 T5 = ci[WS(rs, 1)];
Chris@42 81 Ts = ci[WS(rs, 5)];
Chris@42 82 T1f = TM - TN;
Chris@42 83 TO = TM + TN;
Chris@42 84 TL = T4 - T5;
Chris@42 85 T6 = T4 + T5;
Chris@42 86 Tt = cr[WS(rs, 6)];
Chris@42 87 }
Chris@42 88 {
Chris@42 89 E TC, TD, Tb, Tc;
Chris@42 90 {
Chris@42 91 E T8, T1g, Tu, T9;
Chris@42 92 T8 = cr[WS(rs, 1)];
Chris@42 93 T13 = TO - TL;
Chris@42 94 TP = TL + TO;
Chris@42 95 T1p = T3 - T6;
Chris@42 96 T7 = T3 + T6;
Chris@42 97 T1g = Ts - Tt;
Chris@42 98 Tu = Ts + Tt;
Chris@42 99 T9 = ci[WS(rs, 2)];
Chris@42 100 TC = ci[WS(rs, 4)];
Chris@42 101 T1h = T1f + T1g;
Chris@42 102 T1v = T1f - T1g;
Chris@42 103 TZ = Tr + Tu;
Chris@42 104 Tv = Tr - Tu;
Chris@42 105 Tw = T8 - T9;
Chris@42 106 Ta = T8 + T9;
Chris@42 107 TD = cr[WS(rs, 7)];
Chris@42 108 }
Chris@42 109 Tb = ci[0];
Chris@42 110 Tc = cr[WS(rs, 3)];
Chris@42 111 Tx = ci[WS(rs, 6)];
Chris@42 112 T1j = TC - TD;
Chris@42 113 TE = TC + TD;
Chris@42 114 TB = Tb - Tc;
Chris@42 115 Td = Tb + Tc;
Chris@42 116 Ty = cr[WS(rs, 5)]; /* Last of the 16 input loads; every store to cr/ci happens after this point, which is what makes the in-place update safe within one iteration. */
Chris@42 117 }
Chris@42 118 }
Chris@42 119 {
Chris@42 120 E TR, TF, Te, T1w;
Chris@42 121 TR = TB + TE;
Chris@42 122 TF = TB - TE;
Chris@42 123 Te = Ta + Td;
Chris@42 124 T1w = Ta - Td;
Chris@42 125 {
Chris@42 126 E Tz, T1i, T1B, T1x, T1c;
Chris@42 127 Tz = Tx + Ty;
Chris@42 128 T1i = Tx - Ty;
Chris@42 129 T1B = T1w + T1v;
Chris@42 130 T1x = T1v - T1w;
Chris@42 131 T1c = T7 - Te;
Chris@42 132 cr[0] = T7 + Te; /* The index-0 output of cr is stored without any twiddle multiplication. */
Chris@42 133 {
Chris@42 134 E T1k, T1q, TQ, TA;
Chris@42 135 T1k = T1i + T1j;
Chris@42 136 T1q = T1j - T1i;
Chris@42 137 TQ = Tw + Tz;
Chris@42 138 TA = Tw - Tz;
Chris@42 139 {
Chris@42 140 E T1y, T1C, T1m, T1d;
Chris@42 141 T1y = T1o * T1x;
Chris@42 142 T1C = Tk * T1B;
Chris@42 143 T1m = T1e * T1c;
Chris@42 144 T1d = T1b * T1c;
Chris@42 145 {
Chris@42 146 E T1z, T1r, T1l, TG, T14;
Chris@42 147 T1z = T1p + T1q;
Chris@42 148 T1r = T1p - T1q;
Chris@42 149 T1l = T1h - T1k;
Chris@42 150 ci[0] = T1h + T1k; /* Likewise, the index-0 output of ci carries no twiddle factor. */
Chris@42 151 TG = TA + TF;
Chris@42 152 T14 = TA - TF;
Chris@42 153 {
Chris@42 154 E T10, TS, T1s, T1A;
Chris@42 155 T10 = TQ + TR;
Chris@42 156 TS = TQ - TR;
Chris@42 157 ci[WS(rs, 6)] = FMA(T1u, T1r, T1y);
Chris@42 158 T1s = T1o * T1r;
Chris@42 159 ci[WS(rs, 2)] = FMA(To, T1z, T1C);
Chris@42 160 T1A = Tk * T1z;
Chris@42 161 ci[WS(rs, 4)] = FMA(T1b, T1l, T1m);
Chris@42 162 cr[WS(rs, 4)] = FNMS(T1e, T1l, T1d);
Chris@42 163 {
Chris@42 164 E T15, T19, TV, TH;
Chris@42 165 T15 = FMA(KP707106781, T14, T13); /* The eight KP707106781 combinations (T15, T19, TV, TH, TT, TX, T11, T17) feed only the odd-index outputs 1, 3, 5 and 7. */
Chris@42 166 T19 = FNMS(KP707106781, T14, T13);
Chris@42 167 TV = FMA(KP707106781, TG, Tv);
Chris@42 168 TH = FNMS(KP707106781, TG, Tv);
Chris@42 169 {
Chris@42 170 E TT, TX, T11, T17;
Chris@42 171 TT = FNMS(KP707106781, TS, TP);
Chris@42 172 TX = FMA(KP707106781, TS, TP);
Chris@42 173 T11 = FNMS(KP707106781, T10, TZ);
Chris@42 174 T17 = FMA(KP707106781, T10, TZ);
Chris@42 175 cr[WS(rs, 6)] = FNMS(T1u, T1x, T1s);
Chris@42 176 cr[WS(rs, 2)] = FNMS(To, T1B, T1A);
Chris@42 177 {
Chris@42 178 E T1a, T16, TU, TI;
Chris@42 179 T1a = Tl * T19;
Chris@42 180 T16 = Tg * T15;
Chris@42 181 TU = TK * TH;
Chris@42 182 TI = Tq * TH;
Chris@42 183 {
Chris@42 184 E TY, TW, T18, T12;
Chris@42 185 TY = Ti * TV;
Chris@42 186 TW = Tf * TV;
Chris@42 187 T18 = Tl * T17;
Chris@42 188 T12 = Tg * T11;
Chris@42 189 ci[WS(rs, 7)] = FMA(Tp, T17, T1a);
Chris@42 190 ci[WS(rs, 3)] = FMA(Tj, T11, T16);
Chris@42 191 ci[WS(rs, 5)] = FMA(Tq, TT, TU);
Chris@42 192 cr[WS(rs, 5)] = FNMS(TK, TT, TI);
Chris@42 193 ci[WS(rs, 1)] = FMA(Tf, TX, TY);
Chris@42 194 cr[WS(rs, 1)] = FNMS(Ti, TX, TW);
Chris@42 195 cr[WS(rs, 7)] = FNMS(Tp, T19, T18);
Chris@42 196 cr[WS(rs, 3)] = FNMS(Tj, T15, T12);
Chris@42 197 }
Chris@42 198 }
Chris@42 199 }
Chris@42 200 }
Chris@42 201 }
Chris@42 202 }
Chris@42 203 }
Chris@42 204 }
Chris@42 205 }
Chris@42 206 }
Chris@42 207 }
Chris@42 208 }
Chris@42 209 }
Chris@42 210
Chris@42 211 static const tw_instr twinstr[] = { /* Twiddle instruction list for the HAVE_FMA build of hb2_8; referenced from the codelet descriptor. */
Chris@42 212 {TW_CEXP, 1, 1}, /* Three TW_CEXP entries whose last fields are 1, 3 and 7; presumably each supplies one pair of the six W values hb2_8 reads per iteration -- confirm against the tw_instr definition. */
Chris@42 213 {TW_CEXP, 1, 3},
Chris@42 214 {TW_CEXP, 1, 7},
Chris@42 215 {TW_NEXT, 1, 0} /* Final entry of the list; TW_NEXT semantics are defined outside this file. */
Chris@42 216 };
Chris@42 217
Chris@42 218 static const hc2hc_desc desc = { 8, "hb2_8", twinstr, &GENUS, {44, 20, 30, 0} }; /* Descriptor handed to the registration call: 8 (presumably the codelet size), the codelet name, its twiddle list, &GENUS, and an operation-count tuple whose first three entries match the 44 additions, 20 multiplications and 30 fused multiply/adds quoted in the generated header comment. */
Chris@42 219
Chris@42 220 void X(codelet_hb2_8) (planner *p) { /* Externally visible entry point: registers the static hb2_8 function and its descriptor with planner p. */
Chris@42 221 X(khc2hc_register) (p, hb2_8, &desc); /* The only statement; nothing is returned, so any registration failure is not reported from here. */
Chris@42 222 }
Chris@42 223 #else /* HAVE_FMA */
Chris@42 224
Chris@42 225 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 8 -dif -name hb2_8 -include hb.h */
Chris@42 226
Chris@42 227 /*
Chris@42 228 * This function contains 74 FP additions, 44 FP multiplications,
Chris@42 229 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
Chris@42 230 * 46 stack variables, 1 constants, and 32 memory accesses
Chris@42 231 */
Chris@42 232 #include "hb.h"
Chris@42 233
Chris@42 234 static void hb2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms) /* Generated size-8 codelet, non-FMA variant: for each m in [mb, me) it loads 8 values from cr and 8 from ci, combines them with twiddle factors taken from W, and stores 16 results back into the same cr/ci slots. */
Chris@42 235 { /* cr, ci: data arrays accessed at offsets 0 and WS(rs, 1..7); W: twiddle table, 6 values consumed per iteration; ms: per-iteration step (cr moves by +ms, ci by -ms). */
Chris@42 236 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* Numerically 1/sqrt(2); the only constant this codelet declares. */
Chris@42 237 {
Chris@42 238 INT m;
Chris@42 239 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) { /* W is first advanced by (mb - 1) * 6 entries, so m == 1 presumably corresponds to the start of the table -- confirm against the caller. */
Chris@42 240 E Tf, Ti, Tg, Tj, Tl, Tp, TP, TR, TF, TG, TH, T15, TL, TT;
Chris@42 241 {
Chris@42 242 E Th, To, Tk, Tn;
Chris@42 243 Tf = W[0]; /* (Tf, Ti) and (Tg, Tj) are the first two stored twiddle pairs, applied to the outputs at indices 1 and 3; (TF, TG) below is the third, applied at index 7. */
Chris@42 244 Ti = W[1];
Chris@42 245 Tg = W[2];
Chris@42 246 Tj = W[3];
Chris@42 247 Th = Tf * Tg;
Chris@42 248 To = Ti * Tg;
Chris@42 249 Tk = Ti * Tj;
Chris@42 250 Tn = Tf * Tj;
Chris@42 251 Tl = Th - Tk; /* (Tl, Tp) = (Tf*Tg - Ti*Tj, Tf*Tj + Ti*Tg): derived pair applied to the outputs at index 4. */
Chris@42 252 Tp = Tn + To;
Chris@42 253 TP = Th + Tk; /* (TP, TR) = (Tf*Tg + Ti*Tj, Tf*Tj - Ti*Tg): derived pair applied to the outputs at index 2. */
Chris@42 254 TR = Tn - To;
Chris@42 255 TF = W[4];
Chris@42 256 TG = W[5];
Chris@42 257 TH = FMA(Tf, TF, Ti * TG); /* (TH, TL): derived from (Tf, Ti) and (TF, TG), applied to the outputs at index 6. FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c) presumably c - a*b; both macros are defined outside this file. */
Chris@42 258 T15 = FNMS(TR, TF, TP * TG); /* (TT, T15): derived from (TP, TR) and (TF, TG), applied to the outputs at index 5. */
Chris@42 259 TL = FNMS(Ti, TF, Tf * TG);
Chris@42 260 TT = FMA(TP, TF, TR * TG);
Chris@42 261 }
Chris@42 262 {
Chris@42 263 E T7, T1f, T1i, Tw, TI, TW, T18, TM, Te, T19, T1a, TD, TJ, TZ, T12;
Chris@42 264 E TN, Tm, TE;
Chris@42 265 {
Chris@42 266 E T3, TU, Tv, TV, T6, T16, Ts, T17;
Chris@42 267 {
Chris@42 268 E T1, T2, Tt, Tu;
Chris@42 269 T1 = cr[0];
Chris@42 270 T2 = ci[WS(rs, 3)];
Chris@42 271 T3 = T1 + T2;
Chris@42 272 TU = T1 - T2;
Chris@42 273 Tt = ci[WS(rs, 5)];
Chris@42 274 Tu = cr[WS(rs, 6)];
Chris@42 275 Tv = Tt - Tu;
Chris@42 276 TV = Tt + Tu;
Chris@42 277 }
Chris@42 278 {
Chris@42 279 E T4, T5, Tq, Tr;
Chris@42 280 T4 = cr[WS(rs, 2)];
Chris@42 281 T5 = ci[WS(rs, 1)];
Chris@42 282 T6 = T4 + T5;
Chris@42 283 T16 = T4 - T5;
Chris@42 284 Tq = ci[WS(rs, 7)];
Chris@42 285 Tr = cr[WS(rs, 4)];
Chris@42 286 Ts = Tq - Tr;
Chris@42 287 T17 = Tq + Tr;
Chris@42 288 }
Chris@42 289 T7 = T3 + T6;
Chris@42 290 T1f = TU + TV;
Chris@42 291 T1i = T17 - T16;
Chris@42 292 Tw = Ts + Tv;
Chris@42 293 TI = T3 - T6;
Chris@42 294 TW = TU - TV;
Chris@42 295 T18 = T16 + T17;
Chris@42 296 TM = Ts - Tv;
Chris@42 297 }
Chris@42 298 {
Chris@42 299 E Ta, TX, TC, T11, Td, T10, Tz, TY;
Chris@42 300 {
Chris@42 301 E T8, T9, TA, TB;
Chris@42 302 T8 = cr[WS(rs, 1)];
Chris@42 303 T9 = ci[WS(rs, 2)];
Chris@42 304 Ta = T8 + T9;
Chris@42 305 TX = T8 - T9;
Chris@42 306 TA = ci[WS(rs, 4)];
Chris@42 307 TB = cr[WS(rs, 7)];
Chris@42 308 TC = TA - TB;
Chris@42 309 T11 = TA + TB;
Chris@42 310 }
Chris@42 311 {
Chris@42 312 E Tb, Tc, Tx, Ty;
Chris@42 313 Tb = ci[0];
Chris@42 314 Tc = cr[WS(rs, 3)];
Chris@42 315 Td = Tb + Tc;
Chris@42 316 T10 = Tb - Tc;
Chris@42 317 Tx = ci[WS(rs, 6)];
Chris@42 318 Ty = cr[WS(rs, 5)]; /* Last of the 16 input loads; every store to cr/ci happens after this point, which is what makes the in-place update safe within one iteration. */
Chris@42 319 Tz = Tx - Ty;
Chris@42 320 TY = Tx + Ty;
Chris@42 321 }
Chris@42 322 Te = Ta + Td;
Chris@42 323 T19 = TX + TY;
Chris@42 324 T1a = T10 + T11;
Chris@42 325 TD = Tz + TC;
Chris@42 326 TJ = TC - Tz;
Chris@42 327 TZ = TX - TY;
Chris@42 328 T12 = T10 - T11;
Chris@42 329 TN = Ta - Td;
Chris@42 330 }
Chris@42 331 cr[0] = T7 + Te; /* The two index-0 outputs are stored without any twiddle multiplication. */
Chris@42 332 ci[0] = Tw + TD;
Chris@42 333 Tm = T7 - Te;
Chris@42 334 TE = Tw - TD;
Chris@42 335 cr[WS(rs, 4)] = FNMS(Tp, TE, Tl * Tm);
Chris@42 336 ci[WS(rs, 4)] = FMA(Tp, Tm, Tl * TE);
Chris@42 337 {
Chris@42 338 E TQ, TS, TK, TO;
Chris@42 339 TQ = TI + TJ;
Chris@42 340 TS = TN + TM;
Chris@42 341 cr[WS(rs, 2)] = FNMS(TR, TS, TP * TQ);
Chris@42 342 ci[WS(rs, 2)] = FMA(TP, TS, TR * TQ);
Chris@42 343 TK = TI - TJ;
Chris@42 344 TO = TM - TN;
Chris@42 345 cr[WS(rs, 6)] = FNMS(TL, TO, TH * TK);
Chris@42 346 ci[WS(rs, 6)] = FMA(TH, TO, TL * TK);
Chris@42 347 }
Chris@42 348 {
Chris@42 349 E T1h, T1l, T1k, T1m, T1g, T1j;
Chris@42 350 T1g = KP707106781 * (T19 + T1a); /* The KP707106781-scaled terms feed only odd-index outputs: indices 3 and 7 in this block, 5 and 1 in the next one. */
Chris@42 351 T1h = T1f - T1g;
Chris@42 352 T1l = T1f + T1g;
Chris@42 353 T1j = KP707106781 * (TZ - T12);
Chris@42 354 T1k = T1i + T1j;
Chris@42 355 T1m = T1i - T1j;
Chris@42 356 cr[WS(rs, 3)] = FNMS(Tj, T1k, Tg * T1h);
Chris@42 357 ci[WS(rs, 3)] = FMA(Tg, T1k, Tj * T1h);
Chris@42 358 cr[WS(rs, 7)] = FNMS(TG, T1m, TF * T1l);
Chris@42 359 ci[WS(rs, 7)] = FMA(TF, T1m, TG * T1l);
Chris@42 360 }
Chris@42 361 {
Chris@42 362 E T14, T1d, T1c, T1e, T13, T1b;
Chris@42 363 T13 = KP707106781 * (TZ + T12);
Chris@42 364 T14 = TW - T13;
Chris@42 365 T1d = TW + T13;
Chris@42 366 T1b = KP707106781 * (T19 - T1a);
Chris@42 367 T1c = T18 - T1b;
Chris@42 368 T1e = T18 + T1b;
Chris@42 369 cr[WS(rs, 5)] = FNMS(T15, T1c, TT * T14);
Chris@42 370 ci[WS(rs, 5)] = FMA(T15, T14, TT * T1c);
Chris@42 371 cr[WS(rs, 1)] = FNMS(Ti, T1e, Tf * T1d);
Chris@42 372 ci[WS(rs, 1)] = FMA(Ti, T1d, Tf * T1e);
Chris@42 373 }
Chris@42 374 }
Chris@42 375 }
Chris@42 376 }
Chris@42 377 }
Chris@42 378
Chris@42 379 static const tw_instr twinstr[] = { /* Twiddle instruction list for the non-FMA build of hb2_8; referenced from the codelet descriptor. */
Chris@42 380 {TW_CEXP, 1, 1}, /* Three TW_CEXP entries whose last fields are 1, 3 and 7; presumably each supplies one pair of the six W values hb2_8 reads per iteration -- confirm against the tw_instr definition. */
Chris@42 381 {TW_CEXP, 1, 3},
Chris@42 382 {TW_CEXP, 1, 7},
Chris@42 383 {TW_NEXT, 1, 0} /* Final entry of the list; TW_NEXT semantics are defined outside this file. */
Chris@42 384 };
Chris@42 385
Chris@42 386 static const hc2hc_desc desc = { 8, "hb2_8", twinstr, &GENUS, {56, 26, 18, 0} }; /* Descriptor handed to the registration call: 8 (presumably the codelet size), the codelet name, its twiddle list, &GENUS, and an operation-count tuple whose first three entries match the 56 additions, 26 multiplications and 18 fused multiply/adds quoted in the generated header comment. */
Chris@42 387
Chris@42 388 void X(codelet_hb2_8) (planner *p) { /* Externally visible entry point: registers the static hb2_8 function and its descriptor with planner p. */
Chris@42 389 X(khc2hc_register) (p, hb2_8, &desc); /* The only statement; nothing is returned, so any registration failure is not reported from here. */
Chris@42 390 }
Chris@42 391 #endif /* HAVE_FMA */