annotate src/fftw-3.3.8/rdft/scalar/r2cb/hb2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:37 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hb2_16 -include rdft/scalar/hb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 196 FP additions, 134 FP multiplications,
Chris@82 32 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
Chris@82 33 * 93 stack variables, 3 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hb.h"
Chris@82 36
/*
 * hb2_16, FMA build (this branch is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): genfft-generated size-16 kernel;
 * the recorded generator command lists -n 16 -dif -sign 1 -twiddle-log3.
 *
 * For each m in [mb, me) it loads the 32 values cr[WS(rs, k)] and
 * ci[WS(rs, k)], k = 0..15, combines them with coefficients derived from
 * the eight twiddle values W[0..7], and stores 32 results back to the same
 * locations (in place).
 *
 *   cr, ci  data arrays, read and overwritten; cr advances by +ms and
 *           ci by -ms after every iteration
 *   W       twiddle table, read-only; offset by (mb - 1) * 8 on entry and
 *           advanced by 8 per iteration
 *   rs      stride handed to WS() to address element k
 *   mb, me  first index and one-past-last index of the loop
 *   ms      per-iteration step applied to cr and ci
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) come from the included
 * headers; presumably a * b + c and c - a * b -- confirm there.
 */
Chris@82 37 static void hb2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric constants: KP923879532 ~ cos(pi/8), KP707106781 ~ 1/sqrt(2),
 * KP414213562 ~ sqrt(2) - 1 = tan(pi/8). */
Chris@82 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 40 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 41 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 42 {
Chris@82 43 INT m;
/* One pass per m; the comma expressions in the for header also position and
 * step W, cr and ci. MAKE_VOLATILE_STRIDE is a macro defined elsewhere
 * (its purpose is not visible from this file). */
Chris@82 44 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 45 E Tv, Tw, T2z, T2C, TB, TF, Ty, Tz, T1V, TA, T2G, T3Q, T3C, T3g, T3L;
Chris@82 46 E T30, T3m, T3z, T3w, T3s, T1X, T1Y, T2u, T2c, T2p, TE, TG, T1G, T1o, T1D;
/* Twiddle set-up: W[0..7] are read as four pairs (Tv,Ty), (Tw,Tz), (TB,TF),
 * (T2z,T2C); every other coefficient used by the stores further down is
 * built here from products of these eight values. */
Chris@82 47 {
Chris@82 48 E T3f, T3l, T2F, T3r, T2Z, T3v, TD, Tx;
Chris@82 49 Tv = W[0];
Chris@82 50 Tw = W[2];
Chris@82 51 Tx = Tv * Tw;
Chris@82 52 T2z = W[6];
Chris@82 53 T3f = Tv * T2z;
Chris@82 54 T2C = W[7];
Chris@82 55 T3l = Tv * T2C;
Chris@82 56 TB = W[4];
Chris@82 57 T2F = Tv * TB;
Chris@82 58 T3r = Tw * TB;
Chris@82 59 TF = W[5];
Chris@82 60 T2Z = Tv * TF;
Chris@82 61 T3v = Tw * TF;
Chris@82 62 Ty = W[1];
Chris@82 63 Tz = W[3];
Chris@82 64 TD = Tv * Tz;
Chris@82 65 T1V = FMA(Ty, Tz, Tx);
Chris@82 66 TA = FNMS(Ty, Tz, Tx);
Chris@82 67 T2G = FNMS(Ty, TF, T2F);
Chris@82 68 T3Q = FMA(Tz, TB, T3v);
Chris@82 69 T3C = FNMS(Ty, TB, T2Z);
Chris@82 70 T3g = FMA(Ty, T2C, T3f);
Chris@82 71 T3L = FNMS(Tz, TF, T3r);
Chris@82 72 T30 = FMA(Ty, TB, T2Z);
Chris@82 73 T3m = FNMS(Ty, T2z, T3l);
Chris@82 74 T3z = FMA(Ty, TF, T2F);
Chris@82 75 T3w = FNMS(Tz, TB, T3v);
Chris@82 76 T3s = FMA(Tz, TF, T3r);
Chris@82 77 {
Chris@82 78 E T1W, T2b, TC, T1n;
Chris@82 79 T1W = T1V * TB;
Chris@82 80 T2b = T1V * TF;
Chris@82 81 T1X = FNMS(Ty, Tw, TD);
Chris@82 82 T1Y = FNMS(T1X, TF, T1W);
Chris@82 83 T2u = FNMS(T1X, TB, T2b);
Chris@82 84 T2c = FMA(T1X, TB, T2b);
Chris@82 85 T2p = FMA(T1X, TF, T1W);
Chris@82 86 TC = TA * TB;
Chris@82 87 T1n = TA * TF;
Chris@82 88 TE = FMA(Ty, Tw, TD);
Chris@82 89 TG = FNMS(TE, TF, TC);
Chris@82 90 T1G = FNMS(TE, TB, T1n);
Chris@82 91 T1o = FMA(TE, TB, T1n);
Chris@82 92 T1D = FMA(TE, TF, TC);
Chris@82 93 }
Chris@82 94 }
/* Load phase: all 32 loads from cr/ci happen in the next two sub-blocks,
 * which form sums and differences of input pairs. */
Chris@82 95 {
Chris@82 96 E TL, T1Z, T2d, T1t, T31, T34, T3n, T3D, T3E, T3R, T1w, T20, Tf, T3M, T2L;
Chris@82 97 E T3h, TW, T2e, T3G, T3H, T3N, T2Q, T36, T2V, T37, Tu, T3S, T18, T1z, T24;
Chris@82 98 E T2g, T27, T2h, T1j, T1y;
Chris@82 99 {
Chris@82 100 E T3, TH, TU, T2I, T1s, T32, T6, T1p, Ta, TM, TK, T33, TP, T2J, Td;
Chris@82 101 E TR;
Chris@82 102 {
Chris@82 103 E T1, T2, TS, TT;
Chris@82 104 T1 = cr[0];
Chris@82 105 T2 = ci[WS(rs, 7)];
Chris@82 106 T3 = T1 + T2;
Chris@82 107 TH = T1 - T2;
Chris@82 108 TS = ci[WS(rs, 9)];
Chris@82 109 TT = cr[WS(rs, 14)];
Chris@82 110 TU = TS + TT;
Chris@82 111 T2I = TS - TT;
Chris@82 112 }
Chris@82 113 {
Chris@82 114 E T1q, T1r, T4, T5;
Chris@82 115 T1q = ci[WS(rs, 15)];
Chris@82 116 T1r = cr[WS(rs, 8)];
Chris@82 117 T1s = T1q + T1r;
Chris@82 118 T32 = T1q - T1r;
Chris@82 119 T4 = cr[WS(rs, 4)];
Chris@82 120 T5 = ci[WS(rs, 3)];
Chris@82 121 T6 = T4 + T5;
Chris@82 122 T1p = T4 - T5;
Chris@82 123 }
Chris@82 124 {
Chris@82 125 E T8, T9, TI, TJ;
Chris@82 126 T8 = cr[WS(rs, 2)];
Chris@82 127 T9 = ci[WS(rs, 5)];
Chris@82 128 Ta = T8 + T9;
Chris@82 129 TM = T8 - T9;
Chris@82 130 TI = ci[WS(rs, 11)];
Chris@82 131 TJ = cr[WS(rs, 12)];
Chris@82 132 TK = TI + TJ;
Chris@82 133 T33 = TI - TJ;
Chris@82 134 }
Chris@82 135 {
Chris@82 136 E TN, TO, Tb, Tc;
Chris@82 137 TN = ci[WS(rs, 13)];
Chris@82 138 TO = cr[WS(rs, 10)];
Chris@82 139 TP = TN + TO;
Chris@82 140 T2J = TN - TO;
Chris@82 141 Tb = ci[WS(rs, 1)];
Chris@82 142 Tc = cr[WS(rs, 6)];
Chris@82 143 Td = Tb + Tc;
Chris@82 144 TR = Tb - Tc;
Chris@82 145 }
Chris@82 146 TL = TH - TK;
Chris@82 147 T1Z = TH + TK;
Chris@82 148 T2d = T1s - T1p;
Chris@82 149 T1t = T1p + T1s;
Chris@82 150 T31 = Ta - Td;
Chris@82 151 T34 = T32 - T33;
Chris@82 152 T3n = T34 - T31;
Chris@82 153 {
Chris@82 154 E T1u, T1v, T7, Te;
Chris@82 155 T3D = T32 + T33;
Chris@82 156 T3E = T2J + T2I;
Chris@82 157 T3R = T3D - T3E;
Chris@82 158 T1u = TM + TP;
Chris@82 159 T1v = TR + TU;
Chris@82 160 T1w = T1u - T1v;
Chris@82 161 T20 = T1u + T1v;
Chris@82 162 T7 = T3 + T6;
Chris@82 163 Te = Ta + Td;
Chris@82 164 Tf = T7 + Te;
Chris@82 165 T3M = T7 - Te;
Chris@82 166 {
Chris@82 167 E T2H, T2K, TQ, TV;
Chris@82 168 T2H = T3 - T6;
Chris@82 169 T2K = T2I - T2J;
Chris@82 170 T2L = T2H + T2K;
Chris@82 171 T3h = T2H - T2K;
Chris@82 172 TQ = TM - TP;
Chris@82 173 TV = TR - TU;
Chris@82 174 TW = TQ + TV;
Chris@82 175 T2e = TQ - TV;
Chris@82 176 }
Chris@82 177 }
Chris@82 178 }
Chris@82 179 {
Chris@82 180 E Ti, T1e, T1c, T2N, T1h, T2O, Tl, T19, Tp, T13, T11, T2S, T16, T2T, Ts;
Chris@82 181 E TY, T2M, T2P;
Chris@82 182 {
Chris@82 183 E Tg, Th, T1a, T1b;
Chris@82 184 Tg = cr[WS(rs, 1)];
Chris@82 185 Th = ci[WS(rs, 6)];
Chris@82 186 Ti = Tg + Th;
Chris@82 187 T1e = Tg - Th;
Chris@82 188 T1a = ci[WS(rs, 14)];
Chris@82 189 T1b = cr[WS(rs, 9)];
Chris@82 190 T1c = T1a + T1b;
Chris@82 191 T2N = T1a - T1b;
Chris@82 192 }
Chris@82 193 {
Chris@82 194 E T1f, T1g, Tj, Tk;
Chris@82 195 T1f = ci[WS(rs, 10)];
Chris@82 196 T1g = cr[WS(rs, 13)];
Chris@82 197 T1h = T1f + T1g;
Chris@82 198 T2O = T1f - T1g;
Chris@82 199 Tj = cr[WS(rs, 5)];
Chris@82 200 Tk = ci[WS(rs, 2)];
Chris@82 201 Tl = Tj + Tk;
Chris@82 202 T19 = Tj - Tk;
Chris@82 203 }
Chris@82 204 {
Chris@82 205 E Tn, To, TZ, T10;
Chris@82 206 Tn = ci[0];
Chris@82 207 To = cr[WS(rs, 7)];
Chris@82 208 Tp = Tn + To;
Chris@82 209 T13 = Tn - To;
Chris@82 210 TZ = ci[WS(rs, 8)];
Chris@82 211 T10 = cr[WS(rs, 15)];
Chris@82 212 T11 = TZ + T10;
Chris@82 213 T2S = TZ - T10;
Chris@82 214 }
Chris@82 215 {
Chris@82 216 E T14, T15, Tq, Tr;
Chris@82 217 T14 = ci[WS(rs, 12)];
Chris@82 218 T15 = cr[WS(rs, 11)];
Chris@82 219 T16 = T14 + T15;
Chris@82 220 T2T = T14 - T15;
Chris@82 221 Tq = cr[WS(rs, 3)];
Chris@82 222 Tr = ci[WS(rs, 4)];
Chris@82 223 Ts = Tq + Tr;
Chris@82 224 TY = Tq - Tr;
Chris@82 225 }
Chris@82 226 T3G = T2N + T2O;
Chris@82 227 T3H = T2S + T2T;
Chris@82 228 T3N = T3H - T3G;
Chris@82 229 T2M = Ti - Tl;
Chris@82 230 T2P = T2N - T2O;
Chris@82 231 T2Q = T2M - T2P;
Chris@82 232 T36 = T2M + T2P;
Chris@82 233 {
Chris@82 234 E T2R, T2U, Tm, Tt;
Chris@82 235 T2R = Tp - Ts;
Chris@82 236 T2U = T2S - T2T;
Chris@82 237 T2V = T2R + T2U;
Chris@82 238 T37 = T2U - T2R;
Chris@82 239 Tm = Ti + Tl;
Chris@82 240 Tt = Tp + Ts;
Chris@82 241 Tu = Tm + Tt;
Chris@82 242 T3S = Tm - Tt;
Chris@82 243 }
Chris@82 244 {
Chris@82 245 E T12, T17, T22, T23;
Chris@82 246 T12 = TY - T11;
Chris@82 247 T17 = T13 - T16;
Chris@82 248 T18 = FNMS(KP414213562, T17, T12);
Chris@82 249 T1z = FMA(KP414213562, T12, T17);
Chris@82 250 T22 = T1c - T19;
Chris@82 251 T23 = T1e + T1h;
Chris@82 252 T24 = FNMS(KP414213562, T23, T22);
Chris@82 253 T2g = FMA(KP414213562, T22, T23);
Chris@82 254 }
Chris@82 255 {
Chris@82 256 E T25, T26, T1d, T1i;
Chris@82 257 T25 = TY + T11;
Chris@82 258 T26 = T13 + T16;
Chris@82 259 T27 = FNMS(KP414213562, T26, T25);
Chris@82 260 T2h = FMA(KP414213562, T25, T26);
Chris@82 261 T1d = T19 + T1c;
Chris@82 262 T1i = T1e - T1h;
Chris@82 263 T1j = FMA(KP414213562, T1i, T1d);
Chris@82 264 T1y = FNMS(KP414213562, T1d, T1i);
Chris@82 265 }
Chris@82 266 }
/* Store phase: from here on cr/ci are only written. cr[0] and ci[0] are
 * stored unscaled; every other output k is a two-term combination using the
 * coefficient pair for k (the raw W pairs for k = 1, 3, 9 and 15). */
Chris@82 267 cr[0] = Tf + Tu;
Chris@82 268 {
Chris@82 269 E T3B, T3K, T3F, T3I, T3J, T3A;
Chris@82 270 T3A = Tf - Tu;
Chris@82 271 T3B = T3z * T3A;
Chris@82 272 T3K = T3C * T3A;
Chris@82 273 T3F = T3D + T3E;
Chris@82 274 T3I = T3G + T3H;
Chris@82 275 T3J = T3F - T3I;
Chris@82 276 ci[0] = T3F + T3I;
Chris@82 277 ci[WS(rs, 8)] = FMA(T3z, T3J, T3K);
Chris@82 278 cr[WS(rs, 8)] = FNMS(T3C, T3J, T3B);
Chris@82 279 }
Chris@82 280 {
Chris@82 281 E T3O, T3P, T3T, T3U;
Chris@82 282 T3O = T3M - T3N;
Chris@82 283 T3P = T3L * T3O;
Chris@82 284 T3T = T3R - T3S;
Chris@82 285 T3U = T3L * T3T;
Chris@82 286 cr[WS(rs, 12)] = FNMS(T3Q, T3T, T3P);
Chris@82 287 ci[WS(rs, 12)] = FMA(T3Q, T3O, T3U);
Chris@82 288 }
Chris@82 289 {
Chris@82 290 E T3V, T3W, T3X, T3Y;
Chris@82 291 T3V = T3M + T3N;
Chris@82 292 T3W = TA * T3V;
Chris@82 293 T3X = T3S + T3R;
Chris@82 294 T3Y = TA * T3X;
Chris@82 295 cr[WS(rs, 4)] = FNMS(TE, T3X, T3W);
Chris@82 296 ci[WS(rs, 4)] = FMA(TE, T3V, T3Y);
Chris@82 297 }
Chris@82 298 {
Chris@82 299 E T3j, T3t, T3p, T3x, T3i, T3o;
Chris@82 300 T3i = T37 - T36;
Chris@82 301 T3j = FNMS(KP707106781, T3i, T3h);
Chris@82 302 T3t = FMA(KP707106781, T3i, T3h);
Chris@82 303 T3o = T2Q - T2V;
Chris@82 304 T3p = FNMS(KP707106781, T3o, T3n);
Chris@82 305 T3x = FMA(KP707106781, T3o, T3n);
Chris@82 306 {
Chris@82 307 E T3k, T3q, T3u, T3y;
Chris@82 308 T3k = T3g * T3j;
Chris@82 309 cr[WS(rs, 14)] = FNMS(T3m, T3p, T3k);
Chris@82 310 T3q = T3g * T3p;
Chris@82 311 ci[WS(rs, 14)] = FMA(T3m, T3j, T3q);
Chris@82 312 T3u = T3s * T3t;
Chris@82 313 cr[WS(rs, 6)] = FNMS(T3w, T3x, T3u);
Chris@82 314 T3y = T3s * T3x;
Chris@82 315 ci[WS(rs, 6)] = FMA(T3w, T3t, T3y);
Chris@82 316 }
Chris@82 317 }
Chris@82 318 {
Chris@82 319 E T2X, T3b, T39, T3d, T2W, T35, T38;
Chris@82 320 T2W = T2Q + T2V;
Chris@82 321 T2X = FNMS(KP707106781, T2W, T2L);
Chris@82 322 T3b = FMA(KP707106781, T2W, T2L);
Chris@82 323 T35 = T31 + T34;
Chris@82 324 T38 = T36 + T37;
Chris@82 325 T39 = FNMS(KP707106781, T38, T35);
Chris@82 326 T3d = FMA(KP707106781, T38, T35);
Chris@82 327 {
Chris@82 328 E T2Y, T3a, T3c, T3e;
Chris@82 329 T2Y = T2G * T2X;
Chris@82 330 cr[WS(rs, 10)] = FNMS(T30, T39, T2Y);
Chris@82 331 T3a = T30 * T2X;
Chris@82 332 ci[WS(rs, 10)] = FMA(T2G, T39, T3a);
Chris@82 333 T3c = T1V * T3b;
Chris@82 334 cr[WS(rs, 2)] = FNMS(T1X, T3d, T3c);
Chris@82 335 T3e = T1X * T3b;
Chris@82 336 ci[WS(rs, 2)] = FMA(T1V, T3d, T3e);
Chris@82 337 }
Chris@82 338 }
Chris@82 339 {
Chris@82 340 E T29, T2l, T2j, T2n;
Chris@82 341 {
Chris@82 342 E T21, T28, T2f, T2i;
Chris@82 343 T21 = FNMS(KP707106781, T20, T1Z);
Chris@82 344 T28 = T24 + T27;
Chris@82 345 T29 = FMA(KP923879532, T28, T21);
Chris@82 346 T2l = FNMS(KP923879532, T28, T21);
Chris@82 347 T2f = FMA(KP707106781, T2e, T2d);
Chris@82 348 T2i = T2g - T2h;
Chris@82 349 T2j = FNMS(KP923879532, T2i, T2f);
Chris@82 350 T2n = FMA(KP923879532, T2i, T2f);
Chris@82 351 }
Chris@82 352 {
Chris@82 353 E T2a, T2k, T2m, T2o;
Chris@82 354 T2a = T1Y * T29;
Chris@82 355 cr[WS(rs, 11)] = FNMS(T2c, T2j, T2a);
Chris@82 356 T2k = T2c * T29;
Chris@82 357 ci[WS(rs, 11)] = FMA(T1Y, T2j, T2k);
Chris@82 358 T2m = Tw * T2l;
Chris@82 359 cr[WS(rs, 3)] = FNMS(Tz, T2n, T2m);
Chris@82 360 T2o = Tz * T2l;
Chris@82 361 ci[WS(rs, 3)] = FMA(Tw, T2n, T2o);
Chris@82 362 }
Chris@82 363 }
Chris@82 364 {
Chris@82 365 E T1l, T1E, T1B, T1H;
Chris@82 366 {
Chris@82 367 E TX, T1k, T1x, T1A;
Chris@82 368 TX = FNMS(KP707106781, TW, TL);
Chris@82 369 T1k = T18 - T1j;
Chris@82 370 T1l = FNMS(KP923879532, T1k, TX);
Chris@82 371 T1E = FMA(KP923879532, T1k, TX);
Chris@82 372 T1x = FNMS(KP707106781, T1w, T1t);
Chris@82 373 T1A = T1y - T1z;
Chris@82 374 T1B = FNMS(KP923879532, T1A, T1x);
Chris@82 375 T1H = FMA(KP923879532, T1A, T1x);
Chris@82 376 }
Chris@82 377 {
Chris@82 378 E T1m, T1C, T1F, T1I;
Chris@82 379 T1m = TG * T1l;
Chris@82 380 cr[WS(rs, 13)] = FNMS(T1o, T1B, T1m);
Chris@82 381 T1C = T1o * T1l;
Chris@82 382 ci[WS(rs, 13)] = FMA(TG, T1B, T1C);
Chris@82 383 T1F = T1D * T1E;
Chris@82 384 cr[WS(rs, 5)] = FNMS(T1G, T1H, T1F);
Chris@82 385 T1I = T1G * T1E;
Chris@82 386 ci[WS(rs, 5)] = FMA(T1D, T1H, T1I);
Chris@82 387 }
Chris@82 388 }
Chris@82 389 {
Chris@82 390 E T2s, T2A, T2x, T2D;
Chris@82 391 {
Chris@82 392 E T2q, T2r, T2v, T2w;
Chris@82 393 T2q = FMA(KP707106781, T20, T1Z);
Chris@82 394 T2r = T2g + T2h;
Chris@82 395 T2s = FNMS(KP923879532, T2r, T2q);
Chris@82 396 T2A = FMA(KP923879532, T2r, T2q);
Chris@82 397 T2v = FNMS(KP707106781, T2e, T2d);
Chris@82 398 T2w = T27 - T24;
Chris@82 399 T2x = FMA(KP923879532, T2w, T2v);
Chris@82 400 T2D = FNMS(KP923879532, T2w, T2v);
Chris@82 401 }
Chris@82 402 {
Chris@82 403 E T2t, T2y, T2B, T2E;
Chris@82 404 T2t = T2p * T2s;
Chris@82 405 cr[WS(rs, 7)] = FNMS(T2u, T2x, T2t);
Chris@82 406 T2y = T2p * T2x;
Chris@82 407 ci[WS(rs, 7)] = FMA(T2u, T2s, T2y);
Chris@82 408 T2B = T2z * T2A;
Chris@82 409 cr[WS(rs, 15)] = FNMS(T2C, T2D, T2B);
Chris@82 410 T2E = T2z * T2D;
Chris@82 411 ci[WS(rs, 15)] = FMA(T2C, T2A, T2E);
Chris@82 412 }
Chris@82 413 }
Chris@82 414 {
Chris@82 415 E T1L, T1R, T1P, T1T;
Chris@82 416 {
Chris@82 417 E T1J, T1K, T1N, T1O;
Chris@82 418 T1J = FMA(KP707106781, TW, TL);
Chris@82 419 T1K = T1y + T1z;
Chris@82 420 T1L = FNMS(KP923879532, T1K, T1J);
Chris@82 421 T1R = FMA(KP923879532, T1K, T1J);
Chris@82 422 T1N = FMA(KP707106781, T1w, T1t);
Chris@82 423 T1O = T1j + T18;
Chris@82 424 T1P = FNMS(KP923879532, T1O, T1N);
Chris@82 425 T1T = FMA(KP923879532, T1O, T1N);
Chris@82 426 }
Chris@82 427 {
Chris@82 428 E T1M, T1Q, T1S, T1U;
Chris@82 429 T1M = TB * T1L;
Chris@82 430 cr[WS(rs, 9)] = FNMS(TF, T1P, T1M);
Chris@82 431 T1Q = TB * T1P;
Chris@82 432 ci[WS(rs, 9)] = FMA(TF, T1L, T1Q);
Chris@82 433 T1S = Tv * T1R;
Chris@82 434 cr[WS(rs, 1)] = FNMS(Ty, T1T, T1S);
Chris@82 435 T1U = Tv * T1T;
Chris@82 436 ci[WS(rs, 1)] = FMA(Ty, T1R, T1U);
Chris@82 437 }
Chris@82 438 }
Chris@82 439 }
Chris@82 440 }
Chris@82 441 }
Chris@82 442 }
Chris@82 443
/*
 * Twiddle instruction table handed to the descriptor for this codelet:
 * four TW_CEXP entries whose last fields are 1, 3, 9 and 15, closed by a
 * TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry yields one (re, im) pair,
 * which would match the 8 values W[0..7] the kernel reads per iteration --
 * confirm against the tw_instr definition.
 */
Chris@82 444 static const tw_instr twinstr[] = {
Chris@82 445 {TW_CEXP, 1, 1},
Chris@82 446 {TW_CEXP, 1, 3},
Chris@82 447 {TW_CEXP, 1, 9},
Chris@82 448 {TW_CEXP, 1, 15},
Chris@82 449 {TW_NEXT, 1, 0}
Chris@82 450 };
Chris@82 451
/*
 * Codelet descriptor: size 16, name "hb2_16", the twiddle table above and
 * &GENUS. The counts {104, 42, 92, 0} repeat the figures from the generated
 * statistics comment for this variant (104 additions, 42 multiplications,
 * 92 fused multiply/add); the meaning of the trailing 0 is not visible here.
 */
Chris@82 452 static const hc2hc_desc desc = { 16, "hb2_16", twinstr, &GENUS, {104, 42, 92, 0} };
Chris@82 453
/*
 * Registration entry point (FMA build): passes the hb2_16 kernel and its
 * descriptor to X(khc2hc_register) for planner p. X() is a name-wrapping
 * macro defined elsewhere.
 */
Chris@82 454 void X(codelet_hb2_16) (planner *p) {
Chris@82 455 X(khc2hc_register) (p, hb2_16, &desc);
Chris@82 456 }
Chris@82 457 #else
Chris@82 458
Chris@82 459 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hb2_16 -include rdft/scalar/hb.h */
Chris@82 460
Chris@82 461 /*
Chris@82 462 * This function contains 196 FP additions, 108 FP multiplications,
Chris@82 463 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
Chris@82 464 * 80 stack variables, 3 constants, and 64 memory accesses
Chris@82 465 */
Chris@82 466 #include "rdft/scalar/hb.h"
Chris@82 467
/*
 * hb2_16, non-FMA build (the #else branch, compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined):
 * genfft-generated size-16 kernel; the recorded generator command lists
 * -n 16 -dif -sign 1 -twiddle-log3.
 *
 * For each m in [mb, me) it loads the 32 values cr[WS(rs, k)] and
 * ci[WS(rs, k)], k = 0..15, combines them with coefficients derived from
 * the eight twiddle values W[0..7], and stores 32 results back to the same
 * locations (in place).
 *
 *   cr, ci  data arrays, read and overwritten; cr advances by +ms and
 *           ci by -ms after every iteration
 *   W       twiddle table, read-only; offset by (mb - 1) * 8 on entry and
 *           advanced by 8 per iteration
 *   rs      stride handed to WS() to address element k
 *   mb, me  first index and one-past-last index of the loop
 *   ms      per-iteration step applied to cr and ci
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) come from the included
 * headers; presumably a * b + c and c - a * b -- confirm there.
 */
Chris@82 468 static void hb2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 469 {
/* Numeric constants: KP382683432 ~ sin(pi/8), KP923879532 ~ cos(pi/8),
 * KP707106781 ~ 1/sqrt(2). */
Chris@82 470 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 471 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 472 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 473 {
Chris@82 474 INT m;
/* One pass per m; the comma expressions in the for header also position and
 * step W, cr and ci. MAKE_VOLATILE_STRIDE is a macro defined elsewhere
 * (its purpose is not visible from this file). */
Chris@82 475 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 476 E Tv, Ty, T1l, T1n, T1p, T1t, T27, T25, Tz, Tw, TB, T21, T1P, T1H, T1X;
Chris@82 477 E T17, T1L, T1N, T1v, T1w, T1x, T1B, T2F, T2T, T2b, T2R, T3j, T3x, T35, T3t;
/* Twiddle set-up: W[0..7] are read as four pairs (Tv,Ty), (T1l,T1n),
 * (Tw,Tz), (T1v,T1w); every other coefficient used by the stores further
 * down is built here from products of these eight values. */
Chris@82 478 {
Chris@82 479 E TA, T1J, T15, T1G, Tx, T1K, T16, T1F;
Chris@82 480 {
Chris@82 481 E T1m, T1s, T1o, T1r;
Chris@82 482 Tv = W[0];
Chris@82 483 Ty = W[1];
Chris@82 484 T1l = W[2];
Chris@82 485 T1n = W[3];
Chris@82 486 T1m = Tv * T1l;
Chris@82 487 T1s = Ty * T1l;
Chris@82 488 T1o = Ty * T1n;
Chris@82 489 T1r = Tv * T1n;
Chris@82 490 T1p = T1m + T1o;
Chris@82 491 T1t = T1r - T1s;
Chris@82 492 T27 = T1r + T1s;
Chris@82 493 T25 = T1m - T1o;
Chris@82 494 Tz = W[5];
Chris@82 495 TA = Ty * Tz;
Chris@82 496 T1J = T1l * Tz;
Chris@82 497 T15 = Tv * Tz;
Chris@82 498 T1G = T1n * Tz;
Chris@82 499 Tw = W[4];
Chris@82 500 Tx = Tv * Tw;
Chris@82 501 T1K = T1n * Tw;
Chris@82 502 T16 = Ty * Tw;
Chris@82 503 T1F = T1l * Tw;
Chris@82 504 }
Chris@82 505 TB = Tx - TA;
Chris@82 506 T21 = T1J + T1K;
Chris@82 507 T1P = T15 - T16;
Chris@82 508 T1H = T1F + T1G;
Chris@82 509 T1X = T1F - T1G;
Chris@82 510 T17 = T15 + T16;
Chris@82 511 T1L = T1J - T1K;
Chris@82 512 T1N = Tx + TA;
Chris@82 513 T1v = W[6];
Chris@82 514 T1w = W[7];
Chris@82 515 T1x = FMA(Tv, T1v, Ty * T1w);
Chris@82 516 T1B = FNMS(Ty, T1v, Tv * T1w);
Chris@82 517 {
Chris@82 518 E T2D, T2E, T29, T2a;
Chris@82 519 T2D = T25 * Tz;
Chris@82 520 T2E = T27 * Tw;
Chris@82 521 T2F = T2D + T2E;
Chris@82 522 T2T = T2D - T2E;
Chris@82 523 T29 = T25 * Tw;
Chris@82 524 T2a = T27 * Tz;
Chris@82 525 T2b = T29 - T2a;
Chris@82 526 T2R = T29 + T2a;
Chris@82 527 }
Chris@82 528 {
Chris@82 529 E T3h, T3i, T33, T34;
Chris@82 530 T3h = T1p * Tz;
Chris@82 531 T3i = T1t * Tw;
Chris@82 532 T3j = T3h + T3i;
Chris@82 533 T3x = T3h - T3i;
Chris@82 534 T33 = T1p * Tw;
Chris@82 535 T34 = T1t * Tz;
Chris@82 536 T35 = T33 - T34;
Chris@82 537 T3t = T33 + T34;
Chris@82 538 }
Chris@82 539 }
/* Load phase: all 32 loads from cr/ci happen in the next four sub-blocks,
 * which form sums and differences of input pairs. */
Chris@82 540 {
Chris@82 541 E T7, T36, T3k, TC, T1f, T2e, T2I, T1Q, Te, TJ, T1R, T18, T2L, T37, T2l;
Chris@82 542 E T3l, Tm, T1T, TT, T1h, T2A, T2N, T3b, T3n, Tt, T1U, T12, T1i, T2t, T2O;
Chris@82 543 E T3e, T3o;
Chris@82 544 {
Chris@82 545 E T3, T2c, T1e, T2d, T6, T2G, T1b, T2H;
Chris@82 546 {
Chris@82 547 E T1, T2, T1c, T1d;
Chris@82 548 T1 = cr[0];
Chris@82 549 T2 = ci[WS(rs, 7)];
Chris@82 550 T3 = T1 + T2;
Chris@82 551 T2c = T1 - T2;
Chris@82 552 T1c = ci[WS(rs, 11)];
Chris@82 553 T1d = cr[WS(rs, 12)];
Chris@82 554 T1e = T1c - T1d;
Chris@82 555 T2d = T1c + T1d;
Chris@82 556 }
Chris@82 557 {
Chris@82 558 E T4, T5, T19, T1a;
Chris@82 559 T4 = cr[WS(rs, 4)];
Chris@82 560 T5 = ci[WS(rs, 3)];
Chris@82 561 T6 = T4 + T5;
Chris@82 562 T2G = T4 - T5;
Chris@82 563 T19 = ci[WS(rs, 15)];
Chris@82 564 T1a = cr[WS(rs, 8)];
Chris@82 565 T1b = T19 - T1a;
Chris@82 566 T2H = T19 + T1a;
Chris@82 567 }
Chris@82 568 T7 = T3 + T6;
Chris@82 569 T36 = T2c + T2d;
Chris@82 570 T3k = T2H - T2G;
Chris@82 571 TC = T3 - T6;
Chris@82 572 T1f = T1b - T1e;
Chris@82 573 T2e = T2c - T2d;
Chris@82 574 T2I = T2G + T2H;
Chris@82 575 T1Q = T1b + T1e;
Chris@82 576 }
Chris@82 577 {
Chris@82 578 E Ta, T2f, TI, T2g, Td, T2i, TF, T2j;
Chris@82 579 {
Chris@82 580 E T8, T9, TG, TH;
Chris@82 581 T8 = cr[WS(rs, 2)];
Chris@82 582 T9 = ci[WS(rs, 5)];
Chris@82 583 Ta = T8 + T9;
Chris@82 584 T2f = T8 - T9;
Chris@82 585 TG = ci[WS(rs, 13)];
Chris@82 586 TH = cr[WS(rs, 10)];
Chris@82 587 TI = TG - TH;
Chris@82 588 T2g = TG + TH;
Chris@82 589 }
Chris@82 590 {
Chris@82 591 E Tb, Tc, TD, TE;
Chris@82 592 Tb = ci[WS(rs, 1)];
Chris@82 593 Tc = cr[WS(rs, 6)];
Chris@82 594 Td = Tb + Tc;
Chris@82 595 T2i = Tb - Tc;
Chris@82 596 TD = ci[WS(rs, 9)];
Chris@82 597 TE = cr[WS(rs, 14)];
Chris@82 598 TF = TD - TE;
Chris@82 599 T2j = TD + TE;
Chris@82 600 }
Chris@82 601 Te = Ta + Td;
Chris@82 602 TJ = TF - TI;
Chris@82 603 T1R = TI + TF;
Chris@82 604 T18 = Ta - Td;
Chris@82 605 {
Chris@82 606 E T2J, T2K, T2h, T2k;
Chris@82 607 T2J = T2f + T2g;
Chris@82 608 T2K = T2i + T2j;
Chris@82 609 T2L = KP707106781 * (T2J - T2K);
Chris@82 610 T37 = KP707106781 * (T2J + T2K);
Chris@82 611 T2h = T2f - T2g;
Chris@82 612 T2k = T2i - T2j;
Chris@82 613 T2l = KP707106781 * (T2h + T2k);
Chris@82 614 T3l = KP707106781 * (T2h - T2k);
Chris@82 615 }
Chris@82 616 }
Chris@82 617 {
Chris@82 618 E Ti, T2x, TR, T2y, Tl, T2u, TO, T2v, TL, TS;
Chris@82 619 {
Chris@82 620 E Tg, Th, TP, TQ;
Chris@82 621 Tg = cr[WS(rs, 1)];
Chris@82 622 Th = ci[WS(rs, 6)];
Chris@82 623 Ti = Tg + Th;
Chris@82 624 T2x = Tg - Th;
Chris@82 625 TP = ci[WS(rs, 10)];
Chris@82 626 TQ = cr[WS(rs, 13)];
Chris@82 627 TR = TP - TQ;
Chris@82 628 T2y = TP + TQ;
Chris@82 629 }
Chris@82 630 {
Chris@82 631 E Tj, Tk, TM, TN;
Chris@82 632 Tj = cr[WS(rs, 5)];
Chris@82 633 Tk = ci[WS(rs, 2)];
Chris@82 634 Tl = Tj + Tk;
Chris@82 635 T2u = Tj - Tk;
Chris@82 636 TM = ci[WS(rs, 14)];
Chris@82 637 TN = cr[WS(rs, 9)];
Chris@82 638 TO = TM - TN;
Chris@82 639 T2v = TM + TN;
Chris@82 640 }
Chris@82 641 Tm = Ti + Tl;
Chris@82 642 T1T = TO + TR;
Chris@82 643 TL = Ti - Tl;
Chris@82 644 TS = TO - TR;
Chris@82 645 TT = TL - TS;
Chris@82 646 T1h = TL + TS;
Chris@82 647 {
Chris@82 648 E T2w, T2z, T39, T3a;
Chris@82 649 T2w = T2u + T2v;
Chris@82 650 T2z = T2x - T2y;
Chris@82 651 T2A = FMA(KP923879532, T2w, KP382683432 * T2z);
Chris@82 652 T2N = FNMS(KP382683432, T2w, KP923879532 * T2z);
Chris@82 653 T39 = T2x + T2y;
Chris@82 654 T3a = T2v - T2u;
Chris@82 655 T3b = FNMS(KP923879532, T3a, KP382683432 * T39);
Chris@82 656 T3n = FMA(KP382683432, T3a, KP923879532 * T39);
Chris@82 657 }
Chris@82 658 }
Chris@82 659 {
Chris@82 660 E Tp, T2q, T10, T2r, Ts, T2n, TX, T2o, TU, T11;
Chris@82 661 {
Chris@82 662 E Tn, To, TY, TZ;
Chris@82 663 Tn = ci[0];
Chris@82 664 To = cr[WS(rs, 7)];
Chris@82 665 Tp = Tn + To;
Chris@82 666 T2q = Tn - To;
Chris@82 667 TY = ci[WS(rs, 12)];
Chris@82 668 TZ = cr[WS(rs, 11)];
Chris@82 669 T10 = TY - TZ;
Chris@82 670 T2r = TY + TZ;
Chris@82 671 }
Chris@82 672 {
Chris@82 673 E Tq, Tr, TV, TW;
Chris@82 674 Tq = cr[WS(rs, 3)];
Chris@82 675 Tr = ci[WS(rs, 4)];
Chris@82 676 Ts = Tq + Tr;
Chris@82 677 T2n = Tq - Tr;
Chris@82 678 TV = ci[WS(rs, 8)];
Chris@82 679 TW = cr[WS(rs, 15)];
Chris@82 680 TX = TV - TW;
Chris@82 681 T2o = TV + TW;
Chris@82 682 }
Chris@82 683 Tt = Tp + Ts;
Chris@82 684 T1U = TX + T10;
Chris@82 685 TU = Tp - Ts;
Chris@82 686 T11 = TX - T10;
Chris@82 687 T12 = TU + T11;
Chris@82 688 T1i = T11 - TU;
Chris@82 689 {
Chris@82 690 E T2p, T2s, T3c, T3d;
Chris@82 691 T2p = T2n - T2o;
Chris@82 692 T2s = T2q - T2r;
Chris@82 693 T2t = FNMS(KP382683432, T2s, KP923879532 * T2p);
Chris@82 694 T2O = FMA(KP382683432, T2p, KP923879532 * T2s);
Chris@82 695 T3c = T2q + T2r;
Chris@82 696 T3d = T2n + T2o;
Chris@82 697 T3e = FNMS(KP923879532, T3d, KP382683432 * T3c);
Chris@82 698 T3o = FMA(KP382683432, T3d, KP923879532 * T3c);
Chris@82 699 }
Chris@82 700 }
/* Store phase: from here on cr/ci are only written. cr[0] and ci[0] are
 * stored unscaled; every other output k is a two-term combination using the
 * coefficient pair for k (the raw W pairs for k = 1, 3, 9 and 15). */
Chris@82 701 {
Chris@82 702 E Tf, Tu, T1O, T1S, T1V, T1W;
Chris@82 703 Tf = T7 + Te;
Chris@82 704 Tu = Tm + Tt;
Chris@82 705 T1O = Tf - Tu;
Chris@82 706 T1S = T1Q + T1R;
Chris@82 707 T1V = T1T + T1U;
Chris@82 708 T1W = T1S - T1V;
Chris@82 709 cr[0] = Tf + Tu;
Chris@82 710 ci[0] = T1S + T1V;
Chris@82 711 cr[WS(rs, 8)] = FNMS(T1P, T1W, T1N * T1O);
Chris@82 712 ci[WS(rs, 8)] = FMA(T1P, T1O, T1N * T1W);
Chris@82 713 }
Chris@82 714 {
Chris@82 715 E T3g, T3r, T3q, T3s;
Chris@82 716 {
Chris@82 717 E T38, T3f, T3m, T3p;
Chris@82 718 T38 = T36 - T37;
Chris@82 719 T3f = T3b + T3e;
Chris@82 720 T3g = T38 - T3f;
Chris@82 721 T3r = T38 + T3f;
Chris@82 722 T3m = T3k + T3l;
Chris@82 723 T3p = T3n - T3o;
Chris@82 724 T3q = T3m - T3p;
Chris@82 725 T3s = T3m + T3p;
Chris@82 726 }
Chris@82 727 cr[WS(rs, 11)] = FNMS(T3j, T3q, T35 * T3g);
Chris@82 728 ci[WS(rs, 11)] = FMA(T3j, T3g, T35 * T3q);
Chris@82 729 cr[WS(rs, 3)] = FNMS(T1n, T3s, T1l * T3r);
Chris@82 730 ci[WS(rs, 3)] = FMA(T1n, T3r, T1l * T3s);
Chris@82 731 }
Chris@82 732 {
Chris@82 733 E T3w, T3B, T3A, T3C;
Chris@82 734 {
Chris@82 735 E T3u, T3v, T3y, T3z;
Chris@82 736 T3u = T36 + T37;
Chris@82 737 T3v = T3n + T3o;
Chris@82 738 T3w = T3u - T3v;
Chris@82 739 T3B = T3u + T3v;
Chris@82 740 T3y = T3k - T3l;
Chris@82 741 T3z = T3b - T3e;
Chris@82 742 T3A = T3y + T3z;
Chris@82 743 T3C = T3y - T3z;
Chris@82 744 }
Chris@82 745 cr[WS(rs, 7)] = FNMS(T3x, T3A, T3t * T3w);
Chris@82 746 ci[WS(rs, 7)] = FMA(T3t, T3A, T3x * T3w);
Chris@82 747 cr[WS(rs, 15)] = FNMS(T1w, T3C, T1v * T3B);
Chris@82 748 ci[WS(rs, 15)] = FMA(T1v, T3C, T1w * T3B);
Chris@82 749 }
Chris@82 750 {
Chris@82 751 E T14, T1q, T1k, T1u;
Chris@82 752 {
Chris@82 753 E TK, T13, T1g, T1j;
Chris@82 754 TK = TC + TJ;
Chris@82 755 T13 = KP707106781 * (TT + T12);
Chris@82 756 T14 = TK - T13;
Chris@82 757 T1q = TK + T13;
Chris@82 758 T1g = T18 + T1f;
Chris@82 759 T1j = KP707106781 * (T1h + T1i);
Chris@82 760 T1k = T1g - T1j;
Chris@82 761 T1u = T1g + T1j;
Chris@82 762 }
Chris@82 763 cr[WS(rs, 10)] = FNMS(T17, T1k, TB * T14);
Chris@82 764 ci[WS(rs, 10)] = FMA(T17, T14, TB * T1k);
Chris@82 765 cr[WS(rs, 2)] = FNMS(T1t, T1u, T1p * T1q);
Chris@82 766 ci[WS(rs, 2)] = FMA(T1t, T1q, T1p * T1u);
Chris@82 767 }
Chris@82 768 {
Chris@82 769 E T1A, T1I, T1E, T1M;
Chris@82 770 {
Chris@82 771 E T1y, T1z, T1C, T1D;
Chris@82 772 T1y = TC - TJ;
Chris@82 773 T1z = KP707106781 * (T1i - T1h);
Chris@82 774 T1A = T1y - T1z;
Chris@82 775 T1I = T1y + T1z;
Chris@82 776 T1C = T1f - T18;
Chris@82 777 T1D = KP707106781 * (TT - T12);
Chris@82 778 T1E = T1C - T1D;
Chris@82 779 T1M = T1C + T1D;
Chris@82 780 }
Chris@82 781 cr[WS(rs, 14)] = FNMS(T1B, T1E, T1x * T1A);
Chris@82 782 ci[WS(rs, 14)] = FMA(T1x, T1E, T1B * T1A);
Chris@82 783 cr[WS(rs, 6)] = FNMS(T1L, T1M, T1H * T1I);
Chris@82 784 ci[WS(rs, 6)] = FMA(T1H, T1M, T1L * T1I);
Chris@82 785 }
Chris@82 786 {
Chris@82 787 E T2C, T2S, T2Q, T2U;
Chris@82 788 {
Chris@82 789 E T2m, T2B, T2M, T2P;
Chris@82 790 T2m = T2e - T2l;
Chris@82 791 T2B = T2t - T2A;
Chris@82 792 T2C = T2m - T2B;
Chris@82 793 T2S = T2m + T2B;
Chris@82 794 T2M = T2I - T2L;
Chris@82 795 T2P = T2N - T2O;
Chris@82 796 T2Q = T2M - T2P;
Chris@82 797 T2U = T2M + T2P;
Chris@82 798 }
Chris@82 799 cr[WS(rs, 13)] = FNMS(T2F, T2Q, T2b * T2C);
Chris@82 800 ci[WS(rs, 13)] = FMA(T2F, T2C, T2b * T2Q);
Chris@82 801 cr[WS(rs, 5)] = FNMS(T2T, T2U, T2R * T2S);
Chris@82 802 ci[WS(rs, 5)] = FMA(T2T, T2S, T2R * T2U);
Chris@82 803 }
Chris@82 804 {
Chris@82 805 E T2X, T31, T30, T32;
Chris@82 806 {
Chris@82 807 E T2V, T2W, T2Y, T2Z;
Chris@82 808 T2V = T2e + T2l;
Chris@82 809 T2W = T2N + T2O;
Chris@82 810 T2X = T2V - T2W;
Chris@82 811 T31 = T2V + T2W;
Chris@82 812 T2Y = T2I + T2L;
Chris@82 813 T2Z = T2A + T2t;
Chris@82 814 T30 = T2Y - T2Z;
Chris@82 815 T32 = T2Y + T2Z;
Chris@82 816 }
Chris@82 817 cr[WS(rs, 9)] = FNMS(Tz, T30, Tw * T2X);
Chris@82 818 ci[WS(rs, 9)] = FMA(Tw, T30, Tz * T2X);
Chris@82 819 cr[WS(rs, 1)] = FNMS(Ty, T32, Tv * T31);
Chris@82 820 ci[WS(rs, 1)] = FMA(Tv, T32, Ty * T31);
Chris@82 821 }
Chris@82 822 {
Chris@82 823 E T20, T26, T24, T28;
Chris@82 824 {
Chris@82 825 E T1Y, T1Z, T22, T23;
Chris@82 826 T1Y = T7 - Te;
Chris@82 827 T1Z = T1U - T1T;
Chris@82 828 T20 = T1Y - T1Z;
Chris@82 829 T26 = T1Y + T1Z;
Chris@82 830 T22 = T1Q - T1R;
Chris@82 831 T23 = Tm - Tt;
Chris@82 832 T24 = T22 - T23;
Chris@82 833 T28 = T23 + T22;
Chris@82 834 }
Chris@82 835 cr[WS(rs, 12)] = FNMS(T21, T24, T1X * T20);
Chris@82 836 ci[WS(rs, 12)] = FMA(T1X, T24, T21 * T20);
Chris@82 837 cr[WS(rs, 4)] = FNMS(T27, T28, T25 * T26);
Chris@82 838 ci[WS(rs, 4)] = FMA(T25, T28, T27 * T26);
Chris@82 839 }
Chris@82 840 }
Chris@82 841 }
Chris@82 842 }
Chris@82 843 }
Chris@82 844
/*
 * Twiddle instruction table handed to the descriptor for this codelet
 * (non-FMA branch): four TW_CEXP entries whose last fields are 1, 3, 9 and
 * 15, closed by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry yields one (re, im) pair,
 * which would match the 8 values W[0..7] the kernel reads per iteration --
 * confirm against the tw_instr definition.
 */
Chris@82 845 static const tw_instr twinstr[] = {
Chris@82 846 {TW_CEXP, 1, 1},
Chris@82 847 {TW_CEXP, 1, 3},
Chris@82 848 {TW_CEXP, 1, 9},
Chris@82 849 {TW_CEXP, 1, 15},
Chris@82 850 {TW_NEXT, 1, 0}
Chris@82 851 };
Chris@82 852
/*
 * Codelet descriptor (non-FMA branch): size 16, name "hb2_16", the twiddle
 * table above and &GENUS. The counts {156, 68, 40, 0} repeat the figures
 * from the generated statistics comment for this variant (156 additions,
 * 68 multiplications, 40 fused multiply/add); the meaning of the trailing 0
 * is not visible here.
 */
Chris@82 853 static const hc2hc_desc desc = { 16, "hb2_16", twinstr, &GENUS, {156, 68, 40, 0} };
Chris@82 854
/*
 * Registration entry point (non-FMA build): passes the hb2_16 kernel and
 * its descriptor to X(khc2hc_register) for planner p. X() is a
 * name-wrapping macro defined elsewhere.
 */
Chris@82 855 void X(codelet_hb2_16) (planner *p) {
Chris@82 856 X(khc2hc_register) (p, hb2_16, &desc);
Chris@82 857 }
Chris@82 858 #endif