annotate src/fftw-3.3.8/rdft/scalar/r2cf/hf2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:35 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hf2_16 -include rdft/scalar/hf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 196 FP additions, 134 FP multiplications,
Chris@82 32 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
Chris@82 33 * 90 stack variables, 3 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hf.h"
Chris@82 36
/*
 * hf2_16, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.  Generated codelet (generator
 * flags recorded in this file: -fma -n 16 -dit -twiddle-log3) that
 * transforms 16 (cr, ci) element pairs in place on each loop iteration.
 *
 * cr, ci : data arrays; elements at offsets WS(rs, 0..15) are read and
 *          then overwritten.  cr advances by ms and ci moves back by ms
 *          after every iteration.
 * W      : factor table; W[0..7] are read per iteration and W advances
 *          by 8.  It is first repositioned to W + (mb - 1) * 8.
 * rs     : stride handed to WS() to address the 16 elements.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration pointer step (+ms for cr, -ms for ci).
 */
Chris@82 37 static void hf2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 42 {
Chris@82 43 INT m;
Chris@82 44 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 45 E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
Chris@82 46 E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
/* Load W[0..7] and precompute the derived multipliers: products and
 * FMA/FNMS combinations of the eight loaded values. */
Chris@82 47 {
Chris@82 48 E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
Chris@82 49 T2 = W[0];
Chris@82 50 Tf = W[2];
Chris@82 51 Tg = T2 * Tf;
Chris@82 52 TM = W[6];
Chris@82 53 TN = T2 * TM;
Chris@82 54 TO = W[7];
Chris@82 55 TS = T2 * TO;
Chris@82 56 T3 = W[4];
Chris@82 57 T4 = T2 * T3;
Chris@82 58 Tp = Tf * T3;
Chris@82 59 T6 = W[5];
Chris@82 60 Ta = T2 * T6;
Chris@82 61 Tt = Tf * T6;
Chris@82 62 T5 = W[1];
Chris@82 63 Th = W[3];
Chris@82 64 Tl = T2 * Th;
Chris@82 65 Tz = FMA(T5, Th, Tg);
Chris@82 66 Ti = FNMS(T5, Th, Tg);
Chris@82 67 T7 = FMA(T5, T6, T4);
Chris@82 68 TZ = FNMS(Th, T3, Tt);
Chris@82 69 TT = FNMS(T5, TM, TS);
Chris@82 70 Tq = FNMS(Th, T6, Tp);
Chris@82 71 TW = FMA(Th, T6, Tp);
Chris@82 72 Tb = FNMS(T5, T3, Ta);
Chris@82 73 Tu = FMA(Th, T3, Tt);
Chris@82 74 TP = FMA(T5, TO, TN);
Chris@82 75 TI = FMA(T5, T3, Ta);
Chris@82 76 TF = FNMS(T5, T6, T4);
Chris@82 77 {
Chris@82 78 E T1y, T1C, T1e, T1i;
Chris@82 79 T1y = Tz * T3;
Chris@82 80 T1C = Tz * T6;
Chris@82 81 TC = FNMS(T5, Tf, Tl);
Chris@82 82 T1z = FMA(TC, T6, T1y);
Chris@82 83 T1O = FMA(TC, T3, T1C);
Chris@82 84 T1D = FNMS(TC, T3, T1C);
Chris@82 85 T1L = FNMS(TC, T6, T1y);
Chris@82 86 T1e = Ti * T3;
Chris@82 87 T1i = Ti * T6;
Chris@82 88 Tm = FMA(T5, Tf, Tl);
Chris@82 89 T1f = FMA(Tm, T6, T1e);
Chris@82 90 T1p = FMA(Tm, T3, T1i);
Chris@82 91 T1j = FNMS(Tm, T3, T1i);
Chris@82 92 T1m = FNMS(Tm, T6, T1e);
Chris@82 93 }
Chris@82 94 }
/* Load the 16 (cr, ci) input pairs.  Every input except index 0 is
 * combined with a multiplier pair; inputs whose indices differ by 8
 * are then added and subtracted. */
Chris@82 95 {
Chris@82 96 E Te, T1U, T3A, T3M, T1G, T2w, T2I, T3h, T1R, T2D, T2B, T3i, Tx, T3L, T1Z;
Chris@82 97 E T3w, TL, T21, T26, T38, T1d, T2h, T2s, T3c, T1s, T2t, T2m, T3d, T12, T28;
Chris@82 98 E T2d, T37;
Chris@82 99 {
Chris@82 100 E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
Chris@82 101 T1 = cr[0];
Chris@82 102 T3z = ci[0];
Chris@82 103 T8 = cr[WS(rs, 8)];
Chris@82 104 T9 = T7 * T8;
Chris@82 105 Tc = ci[WS(rs, 8)];
Chris@82 106 T3x = T7 * Tc;
Chris@82 107 Td = FMA(Tb, Tc, T9);
Chris@82 108 Te = T1 + Td;
Chris@82 109 T1U = T1 - Td;
Chris@82 110 T3y = FNMS(Tb, T8, T3x);
Chris@82 111 T3A = T3y + T3z;
Chris@82 112 T3M = T3z - T3y;
Chris@82 113 }
Chris@82 114 {
Chris@82 115 E T1u, T1v, T1w, T2E, T1A, T1B, T1E, T2G;
Chris@82 116 T1u = cr[WS(rs, 15)];
Chris@82 117 T1v = TM * T1u;
Chris@82 118 T1w = ci[WS(rs, 15)];
Chris@82 119 T2E = TM * T1w;
Chris@82 120 T1A = cr[WS(rs, 7)];
Chris@82 121 T1B = T1z * T1A;
Chris@82 122 T1E = ci[WS(rs, 7)];
Chris@82 123 T2G = T1z * T1E;
Chris@82 124 {
Chris@82 125 E T1x, T1F, T2F, T2H;
Chris@82 126 T1x = FMA(TO, T1w, T1v);
Chris@82 127 T1F = FMA(T1D, T1E, T1B);
Chris@82 128 T1G = T1x + T1F;
Chris@82 129 T2w = T1x - T1F;
Chris@82 130 T2F = FNMS(TO, T1u, T2E);
Chris@82 131 T2H = FNMS(T1D, T1A, T2G);
Chris@82 132 T2I = T2F - T2H;
Chris@82 133 T3h = T2F + T2H;
Chris@82 134 }
Chris@82 135 }
Chris@82 136 {
Chris@82 137 E T1H, T1I, T1J, T2x, T1M, T1N, T1P, T2z;
Chris@82 138 T1H = cr[WS(rs, 3)];
Chris@82 139 T1I = Tf * T1H;
Chris@82 140 T1J = ci[WS(rs, 3)];
Chris@82 141 T2x = Tf * T1J;
Chris@82 142 T1M = cr[WS(rs, 11)];
Chris@82 143 T1N = T1L * T1M;
Chris@82 144 T1P = ci[WS(rs, 11)];
Chris@82 145 T2z = T1L * T1P;
Chris@82 146 {
Chris@82 147 E T1K, T1Q, T2y, T2A;
Chris@82 148 T1K = FMA(Th, T1J, T1I);
Chris@82 149 T1Q = FMA(T1O, T1P, T1N);
Chris@82 150 T1R = T1K + T1Q;
Chris@82 151 T2D = T1Q - T1K;
Chris@82 152 T2y = FNMS(Th, T1H, T2x);
Chris@82 153 T2A = FNMS(T1O, T1M, T2z);
Chris@82 154 T2B = T2y - T2A;
Chris@82 155 T3i = T2y + T2A;
Chris@82 156 }
Chris@82 157 }
Chris@82 158 {
Chris@82 159 E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
Chris@82 160 Tj = cr[WS(rs, 4)];
Chris@82 161 Tk = Ti * Tj;
Chris@82 162 Tn = ci[WS(rs, 4)];
Chris@82 163 T1V = Ti * Tn;
Chris@82 164 Tr = cr[WS(rs, 12)];
Chris@82 165 Ts = Tq * Tr;
Chris@82 166 Tv = ci[WS(rs, 12)];
Chris@82 167 T1X = Tq * Tv;
Chris@82 168 {
Chris@82 169 E To, Tw, T1W, T1Y;
Chris@82 170 To = FMA(Tm, Tn, Tk);
Chris@82 171 Tw = FMA(Tu, Tv, Ts);
Chris@82 172 Tx = To + Tw;
Chris@82 173 T3L = To - Tw;
Chris@82 174 T1W = FNMS(Tm, Tj, T1V);
Chris@82 175 T1Y = FNMS(Tu, Tr, T1X);
Chris@82 176 T1Z = T1W - T1Y;
Chris@82 177 T3w = T1W + T1Y;
Chris@82 178 }
Chris@82 179 }
Chris@82 180 {
Chris@82 181 E TA, TB, TD, T22, TG, TH, TJ, T24;
Chris@82 182 TA = cr[WS(rs, 2)];
Chris@82 183 TB = Tz * TA;
Chris@82 184 TD = ci[WS(rs, 2)];
Chris@82 185 T22 = Tz * TD;
Chris@82 186 TG = cr[WS(rs, 10)];
Chris@82 187 TH = TF * TG;
Chris@82 188 TJ = ci[WS(rs, 10)];
Chris@82 189 T24 = TF * TJ;
Chris@82 190 {
Chris@82 191 E TE, TK, T23, T25;
Chris@82 192 TE = FMA(TC, TD, TB);
Chris@82 193 TK = FMA(TI, TJ, TH);
Chris@82 194 TL = TE + TK;
Chris@82 195 T21 = TE - TK;
Chris@82 196 T23 = FNMS(TC, TA, T22);
Chris@82 197 T25 = FNMS(TI, TG, T24);
Chris@82 198 T26 = T23 - T25;
Chris@82 199 T38 = T23 + T25;
Chris@82 200 }
Chris@82 201 }
Chris@82 202 {
Chris@82 203 E T15, T16, T17, T2o, T19, T1a, T1b, T2q;
Chris@82 204 T15 = cr[WS(rs, 1)];
Chris@82 205 T16 = T2 * T15;
Chris@82 206 T17 = ci[WS(rs, 1)];
Chris@82 207 T2o = T2 * T17;
Chris@82 208 T19 = cr[WS(rs, 9)];
Chris@82 209 T1a = T3 * T19;
Chris@82 210 T1b = ci[WS(rs, 9)];
Chris@82 211 T2q = T3 * T1b;
Chris@82 212 {
Chris@82 213 E T18, T1c, T2p, T2r;
Chris@82 214 T18 = FMA(T5, T17, T16);
Chris@82 215 T1c = FMA(T6, T1b, T1a);
Chris@82 216 T1d = T18 + T1c;
Chris@82 217 T2h = T18 - T1c;
Chris@82 218 T2p = FNMS(T5, T15, T2o);
Chris@82 219 T2r = FNMS(T6, T19, T2q);
Chris@82 220 T2s = T2p - T2r;
Chris@82 221 T3c = T2p + T2r;
Chris@82 222 }
Chris@82 223 }
Chris@82 224 {
Chris@82 225 E T1g, T1h, T1k, T2i, T1n, T1o, T1q, T2k;
Chris@82 226 T1g = cr[WS(rs, 5)];
Chris@82 227 T1h = T1f * T1g;
Chris@82 228 T1k = ci[WS(rs, 5)];
Chris@82 229 T2i = T1f * T1k;
Chris@82 230 T1n = cr[WS(rs, 13)];
Chris@82 231 T1o = T1m * T1n;
Chris@82 232 T1q = ci[WS(rs, 13)];
Chris@82 233 T2k = T1m * T1q;
Chris@82 234 {
Chris@82 235 E T1l, T1r, T2j, T2l;
Chris@82 236 T1l = FMA(T1j, T1k, T1h);
Chris@82 237 T1r = FMA(T1p, T1q, T1o);
Chris@82 238 T1s = T1l + T1r;
Chris@82 239 T2t = T1l - T1r;
Chris@82 240 T2j = FNMS(T1j, T1g, T2i);
Chris@82 241 T2l = FNMS(T1p, T1n, T2k);
Chris@82 242 T2m = T2j - T2l;
Chris@82 243 T3d = T2j + T2l;
Chris@82 244 }
Chris@82 245 }
Chris@82 246 {
Chris@82 247 E TQ, TR, TU, T29, TX, TY, T10, T2b;
Chris@82 248 TQ = cr[WS(rs, 14)];
Chris@82 249 TR = TP * TQ;
Chris@82 250 TU = ci[WS(rs, 14)];
Chris@82 251 T29 = TP * TU;
Chris@82 252 TX = cr[WS(rs, 6)];
Chris@82 253 TY = TW * TX;
Chris@82 254 T10 = ci[WS(rs, 6)];
Chris@82 255 T2b = TW * T10;
Chris@82 256 {
Chris@82 257 E TV, T11, T2a, T2c;
Chris@82 258 TV = FMA(TT, TU, TR);
Chris@82 259 T11 = FMA(TZ, T10, TY);
Chris@82 260 T12 = TV + T11;
Chris@82 261 T28 = TV - T11;
Chris@82 262 T2a = FNMS(TT, TQ, T29);
Chris@82 263 T2c = FNMS(TZ, TX, T2b);
Chris@82 264 T2d = T2a - T2c;
Chris@82 265 T37 = T2a + T2c;
Chris@82 266 }
Chris@82 267 }
/* Output group 1: stores cr at WS(rs, k) for k = 0, 4, 8, 12 and ci for
 * k = 3, 7, 11, 15, using additions and subtractions only. */
Chris@82 268 {
Chris@82 269 E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
Chris@82 270 {
Chris@82 271 E Ty, T13, T3v, T3B;
Chris@82 272 Ty = Te + Tx;
Chris@82 273 T13 = TL + T12;
Chris@82 274 T14 = Ty + T13;
Chris@82 275 T3q = Ty - T13;
Chris@82 276 T3v = T38 + T37;
Chris@82 277 T3B = T3w + T3A;
Chris@82 278 T3C = T3v + T3B;
Chris@82 279 T3E = T3B - T3v;
Chris@82 280 }
Chris@82 281 {
Chris@82 282 E T1t, T1S, T3r, T3s;
Chris@82 283 T1t = T1d + T1s;
Chris@82 284 T1S = T1G + T1R;
Chris@82 285 T1T = T1t + T1S;
Chris@82 286 T3D = T1S - T1t;
Chris@82 287 T3r = T3h + T3i;
Chris@82 288 T3s = T3c + T3d;
Chris@82 289 T3t = T3r - T3s;
Chris@82 290 T3u = T3s + T3r;
Chris@82 291 }
Chris@82 292 ci[WS(rs, 7)] = T14 - T1T;
Chris@82 293 cr[WS(rs, 12)] = T3D - T3E;
Chris@82 294 ci[WS(rs, 11)] = T3D + T3E;
Chris@82 295 cr[0] = T14 + T1T;
Chris@82 296 cr[WS(rs, 4)] = T3q - T3t;
Chris@82 297 cr[WS(rs, 8)] = T3u - T3C;
Chris@82 298 ci[WS(rs, 15)] = T3u + T3C;
Chris@82 299 ci[WS(rs, 3)] = T3q + T3t;
Chris@82 300 }
/* Output group 2: stores cr at WS(rs, k) for k = 2, 6, 10, 14 and ci for
 * k = 1, 5, 9, 13; each store scales one term by KP707106781. */
Chris@82 301 {
Chris@82 302 E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
Chris@82 303 {
Chris@82 304 E T36, T39, T3F, T3G;
Chris@82 305 T36 = Te - Tx;
Chris@82 306 T39 = T37 - T38;
Chris@82 307 T3a = T36 - T39;
Chris@82 308 T3m = T36 + T39;
Chris@82 309 T3F = TL - T12;
Chris@82 310 T3G = T3A - T3w;
Chris@82 311 T3H = T3F + T3G;
Chris@82 312 T3J = T3G - T3F;
Chris@82 313 }
Chris@82 314 {
Chris@82 315 E T3b, T3e, T3g, T3j;
Chris@82 316 T3b = T1d - T1s;
Chris@82 317 T3e = T3c - T3d;
Chris@82 318 T3f = T3b + T3e;
Chris@82 319 T3n = T3b - T3e;
Chris@82 320 T3g = T1G - T1R;
Chris@82 321 T3j = T3h - T3i;
Chris@82 322 T3k = T3g - T3j;
Chris@82 323 T3o = T3g + T3j;
Chris@82 324 }
Chris@82 325 {
Chris@82 326 E T3l, T3K, T3p, T3I;
Chris@82 327 T3l = T3f + T3k;
Chris@82 328 ci[WS(rs, 5)] = FNMS(KP707106781, T3l, T3a);
Chris@82 329 cr[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
Chris@82 330 T3K = T3o - T3n;
Chris@82 331 cr[WS(rs, 10)] = FMS(KP707106781, T3K, T3J);
Chris@82 332 ci[WS(rs, 13)] = FMA(KP707106781, T3K, T3J);
Chris@82 333 T3p = T3n + T3o;
Chris@82 334 cr[WS(rs, 6)] = FNMS(KP707106781, T3p, T3m);
Chris@82 335 ci[WS(rs, 1)] = FMA(KP707106781, T3p, T3m);
Chris@82 336 T3I = T3k - T3f;
Chris@82 337 cr[WS(rs, 14)] = FMS(KP707106781, T3I, T3H);
Chris@82 338 ci[WS(rs, 9)] = FMA(KP707106781, T3I, T3H);
Chris@82 339 }
Chris@82 340 }
/* Output group 3: stores cr at every odd k (1..15) and ci at every even
 * k (0..14); intermediate terms are scaled by KP414213562 and
 * KP707106781, and each store scales one term by KP923879532. */
Chris@82 341 {
Chris@82 342 E T20, T3N, T3T, T2Q, T2f, T3U, T30, T33, T2T, T3O, T2v, T2N, T2X, T34, T2K;
Chris@82 343 E T2O;
Chris@82 344 {
Chris@82 345 E T27, T2e, T2n, T2u;
Chris@82 346 T20 = T1U - T1Z;
Chris@82 347 T3N = T3L + T3M;
Chris@82 348 T3T = T3M - T3L;
Chris@82 349 T2Q = T1U + T1Z;
Chris@82 350 T27 = T21 - T26;
Chris@82 351 T2e = T28 + T2d;
Chris@82 352 T2f = T27 + T2e;
Chris@82 353 T3U = T2e - T27;
Chris@82 354 {
Chris@82 355 E T2Y, T2Z, T2R, T2S;
Chris@82 356 T2Y = T2w + T2B;
Chris@82 357 T2Z = T2I + T2D;
Chris@82 358 T30 = FNMS(KP414213562, T2Z, T2Y);
Chris@82 359 T33 = FMA(KP414213562, T2Y, T2Z);
Chris@82 360 T2R = T21 + T26;
Chris@82 361 T2S = T28 - T2d;
Chris@82 362 T2T = T2R + T2S;
Chris@82 363 T3O = T2R - T2S;
Chris@82 364 }
Chris@82 365 T2n = T2h - T2m;
Chris@82 366 T2u = T2s + T2t;
Chris@82 367 T2v = FNMS(KP414213562, T2u, T2n);
Chris@82 368 T2N = FMA(KP414213562, T2n, T2u);
Chris@82 369 {
Chris@82 370 E T2V, T2W, T2C, T2J;
Chris@82 371 T2V = T2h + T2m;
Chris@82 372 T2W = T2s - T2t;
Chris@82 373 T2X = FMA(KP414213562, T2W, T2V);
Chris@82 374 T34 = FNMS(KP414213562, T2V, T2W);
Chris@82 375 T2C = T2w - T2B;
Chris@82 376 T2J = T2D - T2I;
Chris@82 377 T2K = FNMS(KP414213562, T2J, T2C);
Chris@82 378 T2O = FMA(KP414213562, T2C, T2J);
Chris@82 379 }
Chris@82 380 }
Chris@82 381 {
Chris@82 382 E T2g, T2L, T3V, T3W;
Chris@82 383 T2g = FMA(KP707106781, T2f, T20);
Chris@82 384 T2L = T2v + T2K;
Chris@82 385 cr[WS(rs, 7)] = FNMS(KP923879532, T2L, T2g);
Chris@82 386 ci[0] = FMA(KP923879532, T2L, T2g);
Chris@82 387 T3V = FMA(KP707106781, T3U, T3T);
Chris@82 388 T3W = T34 + T33;
Chris@82 389 cr[WS(rs, 9)] = FMS(KP923879532, T3W, T3V);
Chris@82 390 ci[WS(rs, 14)] = FMA(KP923879532, T3W, T3V);
Chris@82 391 }
Chris@82 392 {
Chris@82 393 E T3X, T3Y, T2M, T2P;
Chris@82 394 T3X = FNMS(KP707106781, T3U, T3T);
Chris@82 395 T3Y = T30 - T2X;
Chris@82 396 cr[WS(rs, 13)] = FMS(KP923879532, T3Y, T3X);
Chris@82 397 ci[WS(rs, 10)] = FMA(KP923879532, T3Y, T3X);
Chris@82 398 T2M = FNMS(KP707106781, T2f, T20);
Chris@82 399 T2P = T2N + T2O;
Chris@82 400 ci[WS(rs, 4)] = FNMS(KP923879532, T2P, T2M);
Chris@82 401 cr[WS(rs, 3)] = FMA(KP923879532, T2P, T2M);
Chris@82 402 }
Chris@82 403 {
Chris@82 404 E T2U, T31, T3P, T3Q;
Chris@82 405 T2U = FMA(KP707106781, T2T, T2Q);
Chris@82 406 T31 = T2X + T30;
Chris@82 407 ci[WS(rs, 6)] = FNMS(KP923879532, T31, T2U);
Chris@82 408 cr[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
Chris@82 409 T3P = FMA(KP707106781, T3O, T3N);
Chris@82 410 T3Q = T2O - T2N;
Chris@82 411 cr[WS(rs, 15)] = FMS(KP923879532, T3Q, T3P);
Chris@82 412 ci[WS(rs, 8)] = FMA(KP923879532, T3Q, T3P);
Chris@82 413 }
Chris@82 414 {
Chris@82 415 E T3R, T3S, T32, T35;
Chris@82 416 T3R = FNMS(KP707106781, T3O, T3N);
Chris@82 417 T3S = T2K - T2v;
Chris@82 418 cr[WS(rs, 11)] = FMS(KP923879532, T3S, T3R);
Chris@82 419 ci[WS(rs, 12)] = FMA(KP923879532, T3S, T3R);
Chris@82 420 T32 = FNMS(KP707106781, T2T, T2Q);
Chris@82 421 T35 = T33 - T34;
Chris@82 422 cr[WS(rs, 5)] = FNMS(KP923879532, T35, T32);
Chris@82 423 ci[WS(rs, 2)] = FMA(KP923879532, T35, T32);
Chris@82 424 }
Chris@82 425 }
Chris@82 426 }
Chris@82 427 }
Chris@82 428 }
Chris@82 429 }
Chris@82 430
/*
 * Twiddle instruction list referenced by desc: four TW_CEXP entries whose
 * last fields are 1, 3, 9 and 15, terminated by a TW_NEXT entry.
 * NOTE(review): presumably these identify the four stored complex factors
 * that hf2_16 reads as W[0..7] -- confirm against the tw_instr definition.
 */
Chris@82 431 static const tw_instr twinstr[] = {
Chris@82 432 {TW_CEXP, 1, 1},
Chris@82 433 {TW_CEXP, 1, 3},
Chris@82 434 {TW_CEXP, 1, 9},
Chris@82 435 {TW_CEXP, 1, 15},
Chris@82 436 {TW_NEXT, 1, 0}
Chris@82 437 };
Chris@82 438
/*
 * Descriptor passed at registration: 16, the codelet name string, the
 * twiddle instruction list, &GENUS, and the counts {104, 42, 92, 0}, which
 * match the additions / multiplications / fused multiply-add totals quoted
 * in the generated header comment of this FMA variant.
 */
Chris@82 439 static const hc2hc_desc desc = { 16, "hf2_16", twinstr, &GENUS, {104, 42, 92, 0} };
Chris@82 440
/* Registers hf2_16 and its descriptor with planner p through
 * X(khc2hc_register). */
Chris@82 441 void X(codelet_hf2_16) (planner *p) {
Chris@82 442 X(khc2hc_register) (p, hf2_16, &desc);
Chris@82 443 }
Chris@82 444 #else
Chris@82 445
Chris@82 446 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hf2_16 -include rdft/scalar/hf.h */
Chris@82 447
Chris@82 448 /*
Chris@82 449 * This function contains 196 FP additions, 108 FP multiplications,
Chris@82 450 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
Chris@82 451 * 82 stack variables, 3 constants, and 64 memory accesses
Chris@82 452 */
Chris@82 453 #include "rdft/scalar/hf.h"
Chris@82 454
/*
 * hf2_16, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.  Generated codelet (generator
 * flags recorded in this file: -n 16 -dit -twiddle-log3, without -fma)
 * that transforms 16 (cr, ci) element pairs in place on each loop
 * iteration.
 *
 * cr, ci : data arrays; elements at offsets WS(rs, 0..15) are read and
 *          then overwritten.  cr advances by ms and ci moves back by ms
 *          after every iteration.
 * W      : factor table; W[0..7] are read per iteration and W advances
 *          by 8.  It is first repositioned to W + (mb - 1) * 8.
 * rs     : stride handed to WS() to address the 16 elements.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration pointer step (+ms for cr, -ms for ci).
 */
Chris@82 455 static void hf2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 456 {
Chris@82 457 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 458 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 459 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 460 {
Chris@82 461 INT m;
Chris@82 462 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 463 E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
Chris@82 464 E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
/* Load W[0..7] and precompute the derived multipliers as sums and
 * differences of pairwise products of the loaded values. */
Chris@82 465 {
Chris@82 466 E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
Chris@82 467 {
Chris@82 468 E Th, Tn, Tj, Tm;
Chris@82 469 T2 = W[0];
Chris@82 470 T5 = W[1];
Chris@82 471 Tg = W[2];
Chris@82 472 Ti = W[3];
Chris@82 473 Th = T2 * Tg;
Chris@82 474 Tn = T5 * Tg;
Chris@82 475 Tj = T5 * Ti;
Chris@82 476 Tm = T2 * Ti;
Chris@82 477 Tk = Th - Tj;
Chris@82 478 To = Tm + Tn;
Chris@82 479 TE = Tm - Tn;
Chris@82 480 TC = Th + Tj;
Chris@82 481 T6 = W[5];
Chris@82 482 T7 = T5 * T6;
Chris@82 483 Tv = Tg * T6;
Chris@82 484 Ta = T2 * T6;
Chris@82 485 Ts = Ti * T6;
Chris@82 486 T3 = W[4];
Chris@82 487 T4 = T2 * T3;
Chris@82 488 Tw = Ti * T3;
Chris@82 489 Tb = T5 * T3;
Chris@82 490 Tr = Tg * T3;
Chris@82 491 }
Chris@82 492 T8 = T4 + T7;
Chris@82 493 TW = Tv - Tw;
Chris@82 494 TJ = Ta + Tb;
Chris@82 495 Tt = Tr - Ts;
Chris@82 496 TU = Tr + Ts;
Chris@82 497 Tc = Ta - Tb;
Chris@82 498 Tx = Tv + Tw;
Chris@82 499 TH = T4 - T7;
Chris@82 500 TN = W[6];
Chris@82 501 TO = W[7];
Chris@82 502 TP = FMA(T2, TN, T5 * TO);
Chris@82 503 TR = FNMS(T5, TN, T2 * TO);
Chris@82 504 {
Chris@82 505 E T1d, T1e, T19, T1a;
Chris@82 506 T1d = Tk * T6;
Chris@82 507 T1e = To * T3;
Chris@82 508 T1f = T1d - T1e;
Chris@82 509 T1k = T1d + T1e;
Chris@82 510 T19 = Tk * T3;
Chris@82 511 T1a = To * T6;
Chris@82 512 T1b = T19 + T1a;
Chris@82 513 T1i = T19 - T1a;
Chris@82 514 }
Chris@82 515 {
Chris@82 516 E T1w, T1x, T1s, T1t;
Chris@82 517 T1w = TC * T6;
Chris@82 518 T1x = TE * T3;
Chris@82 519 T1y = T1w - T1x;
Chris@82 520 T1H = T1w + T1x;
Chris@82 521 T1s = TC * T3;
Chris@82 522 T1t = TE * T6;
Chris@82 523 T1u = T1s + T1t;
Chris@82 524 T1F = T1s - T1t;
Chris@82 525 }
Chris@82 526 }
/* Load the 16 (cr, ci) input pairs.  Every input except index 0 is
 * combined with a multiplier pair; inputs whose indices differ by 8
 * are then added and subtracted. */
Chris@82 527 {
Chris@82 528 E Tf, T3s, T1N, T3e, TA, T3r, T1Q, T3b, TM, T2N, T1W, T2w, TZ, T2M, T21;
Chris@82 529 E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2E, T2o, T2D, T18, T1n, T2Q, T2R;
Chris@82 530 E T2S, T2T, T28, T2B, T2d, T2A;
Chris@82 531 {
Chris@82 532 E T1, T3d, Te, T3c, T9, Td;
Chris@82 533 T1 = cr[0];
Chris@82 534 T3d = ci[0];
Chris@82 535 T9 = cr[WS(rs, 8)];
Chris@82 536 Td = ci[WS(rs, 8)];
Chris@82 537 Te = FMA(T8, T9, Tc * Td);
Chris@82 538 T3c = FNMS(Tc, T9, T8 * Td);
Chris@82 539 Tf = T1 + Te;
Chris@82 540 T3s = T3d - T3c;
Chris@82 541 T1N = T1 - Te;
Chris@82 542 T3e = T3c + T3d;
Chris@82 543 }
Chris@82 544 {
Chris@82 545 E Tq, T1O, Tz, T1P;
Chris@82 546 {
Chris@82 547 E Tl, Tp, Tu, Ty;
Chris@82 548 Tl = cr[WS(rs, 4)];
Chris@82 549 Tp = ci[WS(rs, 4)];
Chris@82 550 Tq = FMA(Tk, Tl, To * Tp);
Chris@82 551 T1O = FNMS(To, Tl, Tk * Tp);
Chris@82 552 Tu = cr[WS(rs, 12)];
Chris@82 553 Ty = ci[WS(rs, 12)];
Chris@82 554 Tz = FMA(Tt, Tu, Tx * Ty);
Chris@82 555 T1P = FNMS(Tx, Tu, Tt * Ty);
Chris@82 556 }
Chris@82 557 TA = Tq + Tz;
Chris@82 558 T3r = Tq - Tz;
Chris@82 559 T1Q = T1O - T1P;
Chris@82 560 T3b = T1O + T1P;
Chris@82 561 }
Chris@82 562 {
Chris@82 563 E TG, T1T, TL, T1U, T1S, T1V;
Chris@82 564 {
Chris@82 565 E TD, TF, TI, TK;
Chris@82 566 TD = cr[WS(rs, 2)];
Chris@82 567 TF = ci[WS(rs, 2)];
Chris@82 568 TG = FMA(TC, TD, TE * TF);
Chris@82 569 T1T = FNMS(TE, TD, TC * TF);
Chris@82 570 TI = cr[WS(rs, 10)];
Chris@82 571 TK = ci[WS(rs, 10)];
Chris@82 572 TL = FMA(TH, TI, TJ * TK);
Chris@82 573 T1U = FNMS(TJ, TI, TH * TK);
Chris@82 574 }
Chris@82 575 TM = TG + TL;
Chris@82 576 T2N = T1T + T1U;
Chris@82 577 T1S = TG - TL;
Chris@82 578 T1V = T1T - T1U;
Chris@82 579 T1W = T1S - T1V;
Chris@82 580 T2w = T1S + T1V;
Chris@82 581 }
Chris@82 582 {
Chris@82 583 E TT, T1Y, TY, T1Z, T1X, T20;
Chris@82 584 {
Chris@82 585 E TQ, TS, TV, TX;
Chris@82 586 TQ = cr[WS(rs, 14)];
Chris@82 587 TS = ci[WS(rs, 14)];
Chris@82 588 TT = FMA(TP, TQ, TR * TS);
Chris@82 589 T1Y = FNMS(TR, TQ, TP * TS);
Chris@82 590 TV = cr[WS(rs, 6)];
Chris@82 591 TX = ci[WS(rs, 6)];
Chris@82 592 TY = FMA(TU, TV, TW * TX);
Chris@82 593 T1Z = FNMS(TW, TV, TU * TX);
Chris@82 594 }
Chris@82 595 TZ = TT + TY;
Chris@82 596 T2M = T1Y + T1Z;
Chris@82 597 T1X = TT - TY;
Chris@82 598 T20 = T1Y - T1Z;
Chris@82 599 T21 = T1X + T20;
Chris@82 600 T2x = T1X - T20;
Chris@82 601 }
Chris@82 602 {
Chris@82 603 E T1r, T2f, T1J, T2m, T1A, T2g, T1E, T2l;
Chris@82 604 {
Chris@82 605 E T1p, T1q, T1G, T1I;
Chris@82 606 T1p = cr[WS(rs, 15)];
Chris@82 607 T1q = ci[WS(rs, 15)];
Chris@82 608 T1r = FMA(TN, T1p, TO * T1q);
Chris@82 609 T2f = FNMS(TO, T1p, TN * T1q);
Chris@82 610 T1G = cr[WS(rs, 11)];
Chris@82 611 T1I = ci[WS(rs, 11)];
Chris@82 612 T1J = FMA(T1F, T1G, T1H * T1I);
Chris@82 613 T2m = FNMS(T1H, T1G, T1F * T1I);
Chris@82 614 }
Chris@82 615 {
Chris@82 616 E T1v, T1z, T1C, T1D;
Chris@82 617 T1v = cr[WS(rs, 7)];
Chris@82 618 T1z = ci[WS(rs, 7)];
Chris@82 619 T1A = FMA(T1u, T1v, T1y * T1z);
Chris@82 620 T2g = FNMS(T1y, T1v, T1u * T1z);
Chris@82 621 T1C = cr[WS(rs, 3)];
Chris@82 622 T1D = ci[WS(rs, 3)];
Chris@82 623 T1E = FMA(Tg, T1C, Ti * T1D);
Chris@82 624 T2l = FNMS(Ti, T1C, Tg * T1D);
Chris@82 625 }
Chris@82 626 T1B = T1r + T1A;
Chris@82 627 T1K = T1E + T1J;
Chris@82 628 T2V = T1B - T1K;
Chris@82 629 T2W = T2f + T2g;
Chris@82 630 T2X = T2l + T2m;
Chris@82 631 T2Y = T2W - T2X;
Chris@82 632 {
Chris@82 633 E T2h, T2i, T2k, T2n;
Chris@82 634 T2h = T2f - T2g;
Chris@82 635 T2i = T1E - T1J;
Chris@82 636 T2j = T2h + T2i;
Chris@82 637 T2E = T2h - T2i;
Chris@82 638 T2k = T1r - T1A;
Chris@82 639 T2n = T2l - T2m;
Chris@82 640 T2o = T2k - T2n;
Chris@82 641 T2D = T2k + T2n;
Chris@82 642 }
Chris@82 643 }
Chris@82 644 {
Chris@82 645 E T14, T29, T1m, T26, T17, T2a, T1h, T25;
Chris@82 646 {
Chris@82 647 E T12, T13, T1j, T1l;
Chris@82 648 T12 = cr[WS(rs, 1)];
Chris@82 649 T13 = ci[WS(rs, 1)];
Chris@82 650 T14 = FMA(T2, T12, T5 * T13);
Chris@82 651 T29 = FNMS(T5, T12, T2 * T13);
Chris@82 652 T1j = cr[WS(rs, 13)];
Chris@82 653 T1l = ci[WS(rs, 13)];
Chris@82 654 T1m = FMA(T1i, T1j, T1k * T1l);
Chris@82 655 T26 = FNMS(T1k, T1j, T1i * T1l);
Chris@82 656 }
Chris@82 657 {
Chris@82 658 E T15, T16, T1c, T1g;
Chris@82 659 T15 = cr[WS(rs, 9)];
Chris@82 660 T16 = ci[WS(rs, 9)];
Chris@82 661 T17 = FMA(T3, T15, T6 * T16);
Chris@82 662 T2a = FNMS(T6, T15, T3 * T16);
Chris@82 663 T1c = cr[WS(rs, 5)];
Chris@82 664 T1g = ci[WS(rs, 5)];
Chris@82 665 T1h = FMA(T1b, T1c, T1f * T1g);
Chris@82 666 T25 = FNMS(T1f, T1c, T1b * T1g);
Chris@82 667 }
Chris@82 668 T18 = T14 + T17;
Chris@82 669 T1n = T1h + T1m;
Chris@82 670 T2Q = T18 - T1n;
Chris@82 671 T2R = T29 + T2a;
Chris@82 672 T2S = T25 + T26;
Chris@82 673 T2T = T2R - T2S;
Chris@82 674 {
Chris@82 675 E T24, T27, T2b, T2c;
Chris@82 676 T24 = T14 - T17;
Chris@82 677 T27 = T25 - T26;
Chris@82 678 T28 = T24 - T27;
Chris@82 679 T2B = T24 + T27;
Chris@82 680 T2b = T29 - T2a;
Chris@82 681 T2c = T1h - T1m;
Chris@82 682 T2d = T2b + T2c;
Chris@82 683 T2A = T2b - T2c;
Chris@82 684 }
Chris@82 685 }
/* Output group 1: stores cr at WS(rs, k) for k = 3, 7, 11, 15 and ci for
 * k = 0, 4, 8, 12; terms are scaled by KP707106781, KP382683432 and
 * KP923879532. */
Chris@82 686 {
Chris@82 687 E T23, T2r, T3u, T3w, T2q, T3v, T2u, T3p;
Chris@82 688 {
Chris@82 689 E T1R, T22, T3q, T3t;
Chris@82 690 T1R = T1N - T1Q;
Chris@82 691 T22 = KP707106781 * (T1W + T21);
Chris@82 692 T23 = T1R + T22;
Chris@82 693 T2r = T1R - T22;
Chris@82 694 T3q = KP707106781 * (T2w - T2x);
Chris@82 695 T3t = T3r + T3s;
Chris@82 696 T3u = T3q + T3t;
Chris@82 697 T3w = T3t - T3q;
Chris@82 698 }
Chris@82 699 {
Chris@82 700 E T2e, T2p, T2s, T2t;
Chris@82 701 T2e = FNMS(KP382683432, T2d, KP923879532 * T28);
Chris@82 702 T2p = FMA(KP382683432, T2j, KP923879532 * T2o);
Chris@82 703 T2q = T2e + T2p;
Chris@82 704 T3v = T2p - T2e;
Chris@82 705 T2s = FMA(KP923879532, T2d, KP382683432 * T28);
Chris@82 706 T2t = FNMS(KP923879532, T2j, KP382683432 * T2o);
Chris@82 707 T2u = T2s + T2t;
Chris@82 708 T3p = T2t - T2s;
Chris@82 709 }
Chris@82 710 cr[WS(rs, 7)] = T23 - T2q;
Chris@82 711 cr[WS(rs, 11)] = T3v - T3w;
Chris@82 712 ci[WS(rs, 12)] = T3v + T3w;
Chris@82 713 ci[0] = T23 + T2q;
Chris@82 714 ci[WS(rs, 4)] = T2r - T2u;
Chris@82 715 cr[WS(rs, 15)] = T3p - T3u;
Chris@82 716 ci[WS(rs, 8)] = T3p + T3u;
Chris@82 717 cr[WS(rs, 3)] = T2r + T2u;
Chris@82 718 }
/* Output group 2: stores cr at WS(rs, k) for k = 0, 4, 8, 12 and ci for
 * k = 3, 7, 11, 15, using additions and subtractions only. */
Chris@82 719 {
Chris@82 720 E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
Chris@82 721 {
Chris@82 722 E TB, T10, T3a, T3f;
Chris@82 723 TB = Tf + TA;
Chris@82 724 T10 = TM + TZ;
Chris@82 725 T11 = TB + T10;
Chris@82 726 T35 = TB - T10;
Chris@82 727 T3a = T2N + T2M;
Chris@82 728 T3f = T3b + T3e;
Chris@82 729 T3g = T3a + T3f;
Chris@82 730 T3i = T3f - T3a;
Chris@82 731 }
Chris@82 732 {
Chris@82 733 E T1o, T1L, T36, T37;
Chris@82 734 T1o = T18 + T1n;
Chris@82 735 T1L = T1B + T1K;
Chris@82 736 T1M = T1o + T1L;
Chris@82 737 T3h = T1L - T1o;
Chris@82 738 T36 = T2W + T2X;
Chris@82 739 T37 = T2R + T2S;
Chris@82 740 T38 = T36 - T37;
Chris@82 741 T39 = T37 + T36;
Chris@82 742 }
Chris@82 743 ci[WS(rs, 7)] = T11 - T1M;
Chris@82 744 cr[WS(rs, 12)] = T3h - T3i;
Chris@82 745 ci[WS(rs, 11)] = T3h + T3i;
Chris@82 746 cr[0] = T11 + T1M;
Chris@82 747 cr[WS(rs, 4)] = T35 - T38;
Chris@82 748 cr[WS(rs, 8)] = T39 - T3g;
Chris@82 749 ci[WS(rs, 15)] = T39 + T3g;
Chris@82 750 ci[WS(rs, 3)] = T35 + T38;
Chris@82 751 }
/* Output group 3: stores cr at WS(rs, k) for k = 1, 5, 9, 13 and ci for
 * k = 2, 6, 10, 14; terms are scaled by KP707106781, KP382683432 and
 * KP923879532. */
Chris@82 752 {
Chris@82 753 E T2z, T2H, T3A, T3C, T2G, T3B, T2K, T3x;
Chris@82 754 {
Chris@82 755 E T2v, T2y, T3y, T3z;
Chris@82 756 T2v = T1N + T1Q;
Chris@82 757 T2y = KP707106781 * (T2w + T2x);
Chris@82 758 T2z = T2v + T2y;
Chris@82 759 T2H = T2v - T2y;
Chris@82 760 T3y = KP707106781 * (T21 - T1W);
Chris@82 761 T3z = T3s - T3r;
Chris@82 762 T3A = T3y + T3z;
Chris@82 763 T3C = T3z - T3y;
Chris@82 764 }
Chris@82 765 {
Chris@82 766 E T2C, T2F, T2I, T2J;
Chris@82 767 T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
Chris@82 768 T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
Chris@82 769 T2G = T2C + T2F;
Chris@82 770 T3B = T2F - T2C;
Chris@82 771 T2I = FNMS(KP923879532, T2A, KP382683432 * T2B);
Chris@82 772 T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
Chris@82 773 T2K = T2I + T2J;
Chris@82 774 T3x = T2J - T2I;
Chris@82 775 }
Chris@82 776 ci[WS(rs, 6)] = T2z - T2G;
Chris@82 777 cr[WS(rs, 13)] = T3B - T3C;
Chris@82 778 ci[WS(rs, 10)] = T3B + T3C;
Chris@82 779 cr[WS(rs, 1)] = T2z + T2G;
Chris@82 780 cr[WS(rs, 5)] = T2H - T2K;
Chris@82 781 cr[WS(rs, 9)] = T3x - T3A;
Chris@82 782 ci[WS(rs, 14)] = T3x + T3A;
Chris@82 783 ci[WS(rs, 2)] = T2H + T2K;
Chris@82 784 }
/* Output group 4: stores cr at WS(rs, k) for k = 2, 6, 10, 14 and ci for
 * k = 1, 5, 9, 13; terms are scaled by KP707106781. */
Chris@82 785 {
Chris@82 786 E T2P, T31, T3m, T3o, T30, T3j, T34, T3n;
Chris@82 787 {
Chris@82 788 E T2L, T2O, T3k, T3l;
Chris@82 789 T2L = Tf - TA;
Chris@82 790 T2O = T2M - T2N;
Chris@82 791 T2P = T2L - T2O;
Chris@82 792 T31 = T2L + T2O;
Chris@82 793 T3k = TM - TZ;
Chris@82 794 T3l = T3e - T3b;
Chris@82 795 T3m = T3k + T3l;
Chris@82 796 T3o = T3l - T3k;
Chris@82 797 }
Chris@82 798 {
Chris@82 799 E T2U, T2Z, T32, T33;
Chris@82 800 T2U = T2Q + T2T;
Chris@82 801 T2Z = T2V - T2Y;
Chris@82 802 T30 = KP707106781 * (T2U + T2Z);
Chris@82 803 T3j = KP707106781 * (T2Z - T2U);
Chris@82 804 T32 = T2Q - T2T;
Chris@82 805 T33 = T2V + T2Y;
Chris@82 806 T34 = KP707106781 * (T32 + T33);
Chris@82 807 T3n = KP707106781 * (T33 - T32);
Chris@82 808 }
Chris@82 809 ci[WS(rs, 5)] = T2P - T30;
Chris@82 810 cr[WS(rs, 10)] = T3n - T3o;
Chris@82 811 ci[WS(rs, 13)] = T3n + T3o;
Chris@82 812 cr[WS(rs, 2)] = T2P + T30;
Chris@82 813 cr[WS(rs, 6)] = T31 - T34;
Chris@82 814 cr[WS(rs, 14)] = T3j - T3m;
Chris@82 815 ci[WS(rs, 9)] = T3j + T3m;
Chris@82 816 ci[WS(rs, 1)] = T31 + T34;
Chris@82 817 }
Chris@82 818 }
Chris@82 819 }
Chris@82 820 }
Chris@82 821 }
Chris@82 822
/*
 * Twiddle instruction list referenced by desc: four TW_CEXP entries whose
 * last fields are 1, 3, 9 and 15, terminated by a TW_NEXT entry.
 * NOTE(review): presumably these identify the four stored complex factors
 * that hf2_16 reads as W[0..7] -- confirm against the tw_instr definition.
 */
Chris@82 823 static const tw_instr twinstr[] = {
Chris@82 824 {TW_CEXP, 1, 1},
Chris@82 825 {TW_CEXP, 1, 3},
Chris@82 826 {TW_CEXP, 1, 9},
Chris@82 827 {TW_CEXP, 1, 15},
Chris@82 828 {TW_NEXT, 1, 0}
Chris@82 829 };
Chris@82 830
/*
 * Descriptor passed at registration: 16, the codelet name string, the
 * twiddle instruction list, &GENUS, and the counts {156, 68, 40, 0}, which
 * match the additions / multiplications / fused multiply-add totals quoted
 * in the generated header comment of this non-FMA variant.
 */
Chris@82 831 static const hc2hc_desc desc = { 16, "hf2_16", twinstr, &GENUS, {156, 68, 40, 0} };
Chris@82 832
/* Registers hf2_16 and its descriptor with planner p through
 * X(khc2hc_register). */
Chris@82 833 void X(codelet_hf2_16) (planner *p) {
Chris@82 834 X(khc2hc_register) (p, hf2_16, &desc);
Chris@82 835 }
Chris@82 836 #endif