annotate src/fftw-3.3.5/rdft/scalar/r2cf/hf2_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:46:48 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hf2_8 -include hf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 74 FP additions, 50 FP multiplications,
Chris@42 32 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
Chris@42 33 * 64 stack variables, 1 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "hf.h"
Chris@42 36
/*
 * hf2_8 (HAVE_FMA variant): generated size-8 twiddle codelet
 * (generator flags above: -n 8 -dit -twiddle-log3 -fma).
 *
 * cr, ci : data arrays; elements 0 and WS(rs, 1..7) of each are read and
 *          then overwritten in place.
 * W      : twiddle table; six values W[0..5] are read per iteration.
 * rs     : stride handed to WS() to address the eight elements.
 * mb, me : the loop runs for m = mb, ..., me - 1.
 * ms     : per-iteration step; cr moves by +ms while ci moves by -ms.
 *
 * NOTE(review): the comments below assume FFTW's usual macro meanings
 * FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b, FMS(a,b,c) = a*b - c.
 * They are defined outside this file -- confirm before relying on them.
 */
Chris@42 37 static void hf2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numerically sqrt(2)/2; the only constant used by the kernel. */
Chris@42 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 40 {
Chris@42 41 INT m;
/* W is first offset by (mb - 1) * 6, then advanced by 6 after every iteration. */
Chris@42 42 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@42 43 E TS, T1l, TJ, T1m, T1k, Tw, T1w, T1u;
Chris@42 44 {
Chris@42 45 E T2, T3, Tl, Tn, T5, T4, Tm, Tr, T6;
/*
 * Three twiddle pairs are stored: (T2,T5) = W[0..1], (T3,T6) = W[2..3],
 * (Tl,Tn) = W[4..5].  The other four pairs are derived below as complex
 * products (pairs read as re + i*im):
 *   (T7,Tb) = (T2,T5) * (T3,T6)
 *   (Tf,Ti) = conj(T2,T5) * (T3,T6)
 *   (To,Ts) = conj(T2,T5) * (Tl,Tn)
 *   (TC,TG) = conj(Tf,Ti) * (Tl,Tn)
 */
Chris@42 46 T2 = W[0];
Chris@42 47 T3 = W[2];
Chris@42 48 Tl = W[4];
Chris@42 49 Tn = W[5];
Chris@42 50 T5 = W[1];
Chris@42 51 T4 = T2 * T3;
Chris@42 52 Tm = T2 * Tl;
Chris@42 53 Tr = T2 * Tn;
Chris@42 54 T6 = W[3];
Chris@42 55 {
Chris@42 56 E T1, T1s, TG, Td, T1r, Tu, TY, Tk, TW, T18, T1d, TD, TH, TA, T13;
Chris@42 57 E TE, T14;
Chris@42 58 {
Chris@42 59 E To, Ts, Tf, T7, T8, Ti, Tb, T9, Tc, TC, Ta, TF, TB, Tg, Th;
Chris@42 60 E Tj;
/*
 * Input stage.  Element 0 (T1 = cr[0], T1s = ci[0]) is used as loaded.
 * For k = 1..7 the pair (cr[k], ci[k]) is combined with a twiddle pair
 * (a, b) as (a*cr[k] + b*ci[k], a*ci[k] - b*cr[k]).  Pairs used:
 * k=1 (T2,T5), k=2 (Tf,Ti), k=3 (T3,T6), k=4 (T7,Tb), k=5 (TC,TG),
 * k=6 (To,Ts), k=7 (Tl,Tn).  Loads and arithmetic are interleaved by the
 * generator's scheduler.
 */
Chris@42 61 T1 = cr[0];
Chris@42 62 To = FMA(T5, Tn, Tm);
Chris@42 63 Ts = FNMS(T5, Tl, Tr);
Chris@42 64 Tf = FMA(T5, T6, T4);
Chris@42 65 T7 = FNMS(T5, T6, T4);
Chris@42 66 Ta = T2 * T6;
Chris@42 67 T1s = ci[0];
Chris@42 68 T8 = cr[WS(rs, 4)];
Chris@42 69 TF = Tf * Tn;
Chris@42 70 TB = Tf * Tl;
Chris@42 71 Ti = FNMS(T5, T3, Ta);
Chris@42 72 Tb = FMA(T5, T3, Ta);
Chris@42 73 T9 = T7 * T8;
Chris@42 74 Tc = ci[WS(rs, 4)];
Chris@42 75 TG = FNMS(Ti, Tl, TF);
Chris@42 76 TC = FMA(Ti, Tn, TB);
Chris@42 77 {
Chris@42 78 E Tp, T1q, Tt, Tq, TX;
Chris@42 79 Tp = cr[WS(rs, 6)];
Chris@42 80 Td = FMA(Tb, Tc, T9);
Chris@42 81 T1q = T7 * Tc;
Chris@42 82 Tt = ci[WS(rs, 6)];
Chris@42 83 Tq = To * Tp;
Chris@42 84 Tg = cr[WS(rs, 2)];
Chris@42 85 T1r = FNMS(Tb, T8, T1q);
Chris@42 86 TX = To * Tt;
Chris@42 87 Tu = FMA(Ts, Tt, Tq);
Chris@42 88 Th = Tf * Tg;
Chris@42 89 Tj = ci[WS(rs, 2)];
Chris@42 90 TY = FNMS(Ts, Tp, TX);
Chris@42 91 }
Chris@42 92 {
Chris@42 93 E TO, TQ, TN, TP, T1a, T1b;
Chris@42 94 {
Chris@42 95 E TK, TM, TL, T19, TV;
Chris@42 96 TK = cr[WS(rs, 7)];
Chris@42 97 TM = ci[WS(rs, 7)];
Chris@42 98 Tk = FMA(Ti, Tj, Th);
Chris@42 99 TV = Tf * Tj;
Chris@42 100 TL = Tl * TK;
Chris@42 101 T19 = Tl * TM;
Chris@42 102 TO = cr[WS(rs, 3)];
Chris@42 103 TW = FNMS(Ti, Tg, TV);
Chris@42 104 TQ = ci[WS(rs, 3)];
Chris@42 105 TN = FMA(Tn, TM, TL);
Chris@42 106 TP = T3 * TO;
Chris@42 107 T1a = FNMS(Tn, TK, T19);
Chris@42 108 T1b = T3 * TQ;
Chris@42 109 }
Chris@42 110 {
Chris@42 111 E Tx, Tz, Ty, T12, T1c, TR;
Chris@42 112 Tx = cr[WS(rs, 1)];
Chris@42 113 TR = FMA(T6, TQ, TP);
Chris@42 114 Tz = ci[WS(rs, 1)];
Chris@42 115 T1c = FNMS(T6, TO, T1b);
Chris@42 116 Ty = T2 * Tx;
Chris@42 117 T18 = TN - TR;
Chris@42 118 TS = TN + TR;
Chris@42 119 T12 = T2 * Tz;
Chris@42 120 T1d = T1a - T1c;
Chris@42 121 T1l = T1a + T1c;
Chris@42 122 TD = cr[WS(rs, 5)];
Chris@42 123 TH = ci[WS(rs, 5)];
Chris@42 124 TA = FMA(T5, Tz, Ty);
Chris@42 125 T13 = FNMS(T5, Tx, T12);
Chris@42 126 TE = TC * TD;
Chris@42 127 T14 = TC * TH;
Chris@42 128 }
Chris@42 129 }
Chris@42 130 }
Chris@42 131 {
Chris@42 132 E Te, T1p, Tv, T1t;
Chris@42 133 {
Chris@42 134 E T1g, T10, T1z, T1B, T1C, T1j, T1A, T1f;
Chris@42 135 {
Chris@42 136 E T1x, T11, T16, T1y;
Chris@42 137 {
Chris@42 138 E TU, TZ, TI, T15;
/* Sum/difference pairs of the twiddled values (k=5 is finished here first). */
Chris@42 139 Te = T1 + Td;
Chris@42 140 TU = T1 - Td;
Chris@42 141 TZ = TW - TY;
Chris@42 142 T1p = TW + TY;
Chris@42 143 TI = FMA(TG, TH, TE);
Chris@42 144 T15 = FNMS(TG, TD, T14);
Chris@42 145 Tv = Tk + Tu;
Chris@42 146 T1x = Tk - Tu;
Chris@42 147 T1g = TU - TZ;
Chris@42 148 T10 = TU + TZ;
Chris@42 149 T11 = TA - TI;
Chris@42 150 TJ = TA + TI;
Chris@42 151 T1m = T13 + T15;
Chris@42 152 T16 = T13 - T15;
Chris@42 153 T1y = T1s - T1r;
Chris@42 154 T1t = T1r + T1s;
Chris@42 155 }
Chris@42 156 {
Chris@42 157 E T1i, T1e, T17, T1h;
Chris@42 158 T1i = T18 + T1d;
Chris@42 159 T1e = T18 - T1d;
Chris@42 160 T17 = T11 + T16;
Chris@42 161 T1h = T11 - T16;
Chris@42 162 T1z = T1x + T1y;
Chris@42 163 T1B = T1y - T1x;
Chris@42 164 T1C = T1i - T1h;
Chris@42 165 T1j = T1h + T1i;
Chris@42 166 T1A = T1e - T17;
Chris@42 167 T1f = T17 + T1e;
Chris@42 168 }
Chris@42 169 }
/* The eight outputs that carry a KP707106781-scaled term, written in place. */
Chris@42 170 cr[WS(rs, 3)] = FNMS(KP707106781, T1j, T1g);
Chris@42 171 cr[WS(rs, 7)] = FMS(KP707106781, T1A, T1z);
Chris@42 172 cr[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
Chris@42 173 ci[WS(rs, 2)] = FNMS(KP707106781, T1f, T10);
Chris@42 174 ci[WS(rs, 6)] = FMA(KP707106781, T1C, T1B);
Chris@42 175 cr[WS(rs, 5)] = FMS(KP707106781, T1C, T1B);
Chris@42 176 ci[WS(rs, 4)] = FMA(KP707106781, T1A, T1z);
Chris@42 177 ci[0] = FMA(KP707106781, T1j, T1g);
Chris@42 178 }
Chris@42 179 T1k = Te - Tv;
Chris@42 180 Tw = Te + Tv;
Chris@42 181 T1w = T1t - T1p;
Chris@42 182 T1u = T1p + T1t;
Chris@42 183 }
Chris@42 184 }
Chris@42 185 }
Chris@42 186 {
Chris@42 187 E TT, T1v, T1n, T1o;
/* The remaining eight outputs are plain sums and differences (no scaling). */
Chris@42 188 TT = TJ + TS;
Chris@42 189 T1v = TS - TJ;
Chris@42 190 T1n = T1l - T1m;
Chris@42 191 T1o = T1m + T1l;
Chris@42 192 ci[WS(rs, 5)] = T1v + T1w;
Chris@42 193 cr[WS(rs, 6)] = T1v - T1w;
Chris@42 194 cr[0] = Tw + TT;
Chris@42 195 ci[WS(rs, 3)] = Tw - TT;
Chris@42 196 ci[WS(rs, 7)] = T1o + T1u;
Chris@42 197 cr[WS(rs, 4)] = T1o - T1u;
Chris@42 198 ci[WS(rs, 1)] = T1k + T1n;
Chris@42 199 cr[WS(rs, 2)] = T1k - T1n;
Chris@42 200 }
Chris@42 201 }
Chris@42 202 }
Chris@42 203 }
Chris@42 204
/*
 * Twiddle program referenced by the codelet descriptor: three TW_CEXP
 * entries followed by a TW_NEXT entry.  Three entries agree with the six
 * reals (three pairs) that hf2_8 reads from W per iteration.
 * NOTE(review): presumably the last field (1, 3, 7) is the twiddle exponent
 * and TW_NEXT terminates the list -- confirm against the tw_instr definition.
 */
Chris@42 205 static const tw_instr twinstr[] = {
Chris@42 206 {TW_CEXP, 1, 1},
Chris@42 207 {TW_CEXP, 1, 3},
Chris@42 208 {TW_CEXP, 1, 7},
Chris@42 209 {TW_NEXT, 1, 0}
Chris@42 210 };
Chris@42 211
/*
 * Descriptor passed at registration: 8, the name "hf2_8", the twiddle
 * program, &GENUS, and {44, 20, 30, 0}.  The first three counts equal the
 * 44 additions / 20 multiplications / 30 fused multiply/add quoted in the
 * operation-count comment of this HAVE_FMA variant.
 * NOTE(review): the leading 8 is presumably the codelet size (-n 8) -- confirm.
 */
Chris@42 212 static const hc2hc_desc desc = { 8, "hf2_8", twinstr, &GENUS, {44, 20, 30, 0} };
Chris@42 213
/* Registration entry point: hands hf2_8 and its descriptor to planner p via X(khc2hc_register). */
Chris@42 214 void X(codelet_hf2_8) (planner *p) {
Chris@42 215 X(khc2hc_register) (p, hf2_8, &desc);
Chris@42 216 }
Chris@42 217 #else /* HAVE_FMA */
Chris@42 218
Chris@42 219 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hf2_8 -include hf.h */
Chris@42 220
Chris@42 221 /*
Chris@42 222 * This function contains 74 FP additions, 44 FP multiplications,
Chris@42 223 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
Chris@42 224 * 42 stack variables, 1 constants, and 32 memory accesses
Chris@42 225 */
Chris@42 226 #include "hf.h"
Chris@42 227
/*
 * hf2_8 (non-FMA variant): generated size-8 twiddle codelet
 * (generator flags above: -n 8 -dit -twiddle-log3).
 *
 * cr, ci : data arrays; elements 0 and WS(rs, 1..7) of each are read and
 *          then overwritten in place.
 * W      : twiddle table; six values W[0..5] are read per iteration.
 * rs     : stride handed to WS() to address the eight elements.
 * mb, me : the loop runs for m = mb, ..., me - 1.
 * ms     : per-iteration step; cr moves by +ms while ci moves by -ms.
 *
 * NOTE(review): the comments below assume FFTW's usual macro meanings
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b.  They are defined
 * outside this file -- confirm before relying on them.
 */
Chris@42 228 static void hf2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 229 {
/* Numerically sqrt(2)/2; the only constant used by the kernel. */
Chris@42 230 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 231 {
Chris@42 232 INT m;
/* W is first offset by (mb - 1) * 6, then advanced by 6 after every iteration. */
Chris@42 233 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@42 234 E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
Chris@42 235 {
Chris@42 236 E T4, Tb, T7, Ta;
/*
 * Three twiddle pairs are stored: (T2,T5) = W[0..1], (T3,T6) = W[2..3],
 * (Tl,Tm) = W[4..5].  The other four pairs are derived as complex products
 * (pairs read as re + i*im):
 *   (T8,Tc) = (T2,T5) * (T3,T6)
 *   (Tg,Ti) = conj(T2,T5) * (T3,T6)
 *   (Tn,Tp) = conj(T2,T5) * (Tl,Tm)
 *   (Tx,Tz) = conj(Tg,Ti) * (Tl,Tm)
 */
Chris@42 237 T2 = W[0];
Chris@42 238 T5 = W[1];
Chris@42 239 T3 = W[2];
Chris@42 240 T6 = W[3];
Chris@42 241 T4 = T2 * T3;
Chris@42 242 Tb = T5 * T3;
Chris@42 243 T7 = T5 * T6;
Chris@42 244 Ta = T2 * T6;
Chris@42 245 T8 = T4 - T7;
Chris@42 246 Tc = Ta + Tb;
Chris@42 247 Tg = T4 + T7;
Chris@42 248 Ti = Ta - Tb;
Chris@42 249 Tl = W[4];
Chris@42 250 Tm = W[5];
Chris@42 251 Tn = FMA(T2, Tl, T5 * Tm);
Chris@42 252 Tz = FNMS(Ti, Tl, Tg * Tm);
Chris@42 253 Tp = FNMS(T5, Tl, T2 * Tm);
Chris@42 254 Tx = FMA(Tg, Tl, Ti * Tm);
Chris@42 255 }
Chris@42 256 {
Chris@42 257 E Tf, T1j, TL, T1d, TJ, T16, TV, TY, Ts, T1i, TO, T1a, TC, T17, TQ;
Chris@42 258 E TT;
/*
 * Input stage, done as four groups of two elements: (0,4), (7,3), (2,6),
 * (1,5).  Element 0 is used as loaded; every other pair (cr[k], ci[k]) is
 * combined with a twiddle pair (a, b) as
 * (a*cr[k] + b*ci[k], a*ci[k] - b*cr[k]).  Each group then forms the sum
 * and difference of its two twiddled values.
 */
Chris@42 259 {
Chris@42 260 E T1, T1c, Te, T1b, T9, Td;
Chris@42 261 T1 = cr[0];
Chris@42 262 T1c = ci[0];
Chris@42 263 T9 = cr[WS(rs, 4)];
Chris@42 264 Td = ci[WS(rs, 4)];
Chris@42 265 Te = FMA(T8, T9, Tc * Td);
Chris@42 266 T1b = FNMS(Tc, T9, T8 * Td);
Chris@42 267 Tf = T1 + Te;
Chris@42 268 T1j = T1c - T1b;
Chris@42 269 TL = T1 - Te;
Chris@42 270 T1d = T1b + T1c;
Chris@42 271 }
Chris@42 272 {
Chris@42 273 E TF, TW, TI, TX;
Chris@42 274 {
Chris@42 275 E TD, TE, TG, TH;
Chris@42 276 TD = cr[WS(rs, 7)];
Chris@42 277 TE = ci[WS(rs, 7)];
Chris@42 278 TF = FMA(Tl, TD, Tm * TE);
Chris@42 279 TW = FNMS(Tm, TD, Tl * TE);
Chris@42 280 TG = cr[WS(rs, 3)];
Chris@42 281 TH = ci[WS(rs, 3)];
Chris@42 282 TI = FMA(T3, TG, T6 * TH);
Chris@42 283 TX = FNMS(T6, TG, T3 * TH);
Chris@42 284 }
Chris@42 285 TJ = TF + TI;
Chris@42 286 T16 = TW + TX;
Chris@42 287 TV = TF - TI;
Chris@42 288 TY = TW - TX;
Chris@42 289 }
Chris@42 290 {
Chris@42 291 E Tk, TM, Tr, TN;
Chris@42 292 {
Chris@42 293 E Th, Tj, To, Tq;
Chris@42 294 Th = cr[WS(rs, 2)];
Chris@42 295 Tj = ci[WS(rs, 2)];
Chris@42 296 Tk = FMA(Tg, Th, Ti * Tj);
Chris@42 297 TM = FNMS(Ti, Th, Tg * Tj);
Chris@42 298 To = cr[WS(rs, 6)];
Chris@42 299 Tq = ci[WS(rs, 6)];
Chris@42 300 Tr = FMA(Tn, To, Tp * Tq);
Chris@42 301 TN = FNMS(Tp, To, Tn * Tq);
Chris@42 302 }
Chris@42 303 Ts = Tk + Tr;
Chris@42 304 T1i = Tk - Tr;
Chris@42 305 TO = TM - TN;
Chris@42 306 T1a = TM + TN;
Chris@42 307 }
Chris@42 308 {
Chris@42 309 E Tw, TR, TB, TS;
Chris@42 310 {
Chris@42 311 E Tu, Tv, Ty, TA;
Chris@42 312 Tu = cr[WS(rs, 1)];
Chris@42 313 Tv = ci[WS(rs, 1)];
Chris@42 314 Tw = FMA(T2, Tu, T5 * Tv);
Chris@42 315 TR = FNMS(T5, Tu, T2 * Tv);
Chris@42 316 Ty = cr[WS(rs, 5)];
Chris@42 317 TA = ci[WS(rs, 5)];
Chris@42 318 TB = FMA(Tx, Ty, Tz * TA);
Chris@42 319 TS = FNMS(Tz, Ty, Tx * TA);
Chris@42 320 }
Chris@42 321 TC = Tw + TB;
Chris@42 322 T17 = TR + TS;
Chris@42 323 TQ = Tw - TB;
Chris@42 324 TT = TR - TS;
Chris@42 325 }
/*
 * Output stage, first half: four plain sum/difference outputs, then four
 * outputs that include a KP707106781-scaled term.  All stores are in place.
 */
Chris@42 326 {
Chris@42 327 E Tt, TK, T1f, T1g;
Chris@42 328 Tt = Tf + Ts;
Chris@42 329 TK = TC + TJ;
Chris@42 330 ci[WS(rs, 3)] = Tt - TK;
Chris@42 331 cr[0] = Tt + TK;
Chris@42 332 T1f = TJ - TC;
Chris@42 333 T1g = T1d - T1a;
Chris@42 334 cr[WS(rs, 6)] = T1f - T1g;
Chris@42 335 ci[WS(rs, 5)] = T1f + T1g;
Chris@42 336 {
Chris@42 337 E T11, T1m, T14, T1l, T12, T13;
Chris@42 338 T11 = TL - TO;
Chris@42 339 T1m = T1j - T1i;
Chris@42 340 T12 = TQ - TT;
Chris@42 341 T13 = TV + TY;
Chris@42 342 T14 = KP707106781 * (T12 + T13);
Chris@42 343 T1l = KP707106781 * (T13 - T12);
Chris@42 344 cr[WS(rs, 3)] = T11 - T14;
Chris@42 345 ci[WS(rs, 6)] = T1l + T1m;
Chris@42 346 ci[0] = T11 + T14;
Chris@42 347 cr[WS(rs, 5)] = T1l - T1m;
Chris@42 348 }
Chris@42 349 }
/* Output stage, second half: same shape as above for the remaining eight stores. */
Chris@42 350 {
Chris@42 351 E T19, T1e, T15, T18;
Chris@42 352 T19 = T17 + T16;
Chris@42 353 T1e = T1a + T1d;
Chris@42 354 cr[WS(rs, 4)] = T19 - T1e;
Chris@42 355 ci[WS(rs, 7)] = T19 + T1e;
Chris@42 356 T15 = Tf - Ts;
Chris@42 357 T18 = T16 - T17;
Chris@42 358 cr[WS(rs, 2)] = T15 - T18;
Chris@42 359 ci[WS(rs, 1)] = T15 + T18;
Chris@42 360 {
Chris@42 361 E TP, T1k, T10, T1h, TU, TZ;
Chris@42 362 TP = TL + TO;
Chris@42 363 T1k = T1i + T1j;
Chris@42 364 TU = TQ + TT;
Chris@42 365 TZ = TV - TY;
Chris@42 366 T10 = KP707106781 * (TU + TZ);
Chris@42 367 T1h = KP707106781 * (TZ - TU);
Chris@42 368 ci[WS(rs, 2)] = TP - T10;
Chris@42 369 ci[WS(rs, 4)] = T1h + T1k;
Chris@42 370 cr[WS(rs, 1)] = TP + T10;
Chris@42 371 cr[WS(rs, 7)] = T1h - T1k;
Chris@42 372 }
Chris@42 373 }
Chris@42 374 }
Chris@42 375 }
Chris@42 376 }
Chris@42 377 }
Chris@42 378
/*
 * Twiddle program referenced by the codelet descriptor: three TW_CEXP
 * entries followed by a TW_NEXT entry.  Three entries agree with the six
 * reals (three pairs) that hf2_8 reads from W per iteration.
 * NOTE(review): presumably the last field (1, 3, 7) is the twiddle exponent
 * and TW_NEXT terminates the list -- confirm against the tw_instr definition.
 */
Chris@42 379 static const tw_instr twinstr[] = {
Chris@42 380 {TW_CEXP, 1, 1},
Chris@42 381 {TW_CEXP, 1, 3},
Chris@42 382 {TW_CEXP, 1, 7},
Chris@42 383 {TW_NEXT, 1, 0}
Chris@42 384 };
Chris@42 385
/*
 * Descriptor passed at registration: 8, the name "hf2_8", the twiddle
 * program, &GENUS, and {56, 26, 18, 0}.  The first three counts equal the
 * 56 additions / 26 multiplications / 18 fused multiply/add quoted in the
 * operation-count comment of this non-FMA variant.
 * NOTE(review): the leading 8 is presumably the codelet size (-n 8) -- confirm.
 */
Chris@42 386 static const hc2hc_desc desc = { 8, "hf2_8", twinstr, &GENUS, {56, 26, 18, 0} };
Chris@42 387
/* Registration entry point: hands hf2_8 and its descriptor to planner p via X(khc2hc_register). */
Chris@42 388 void X(codelet_hf2_8) (planner *p) {
Chris@42 389 X(khc2hc_register) (p, hf2_8, &desc);
Chris@42 390 }
Chris@42 391 #endif /* HAVE_FMA */