annotate src/fftw-3.3.8/rdft/scalar/r2cb/hc2cb2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:55 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hc2cb2_16 -include rdft/scalar/hc2cb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 196 FP additions, 134 FP multiplications,
Chris@82 32 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
Chris@82 33 * 93 stack variables, 3 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cb.h"
Chris@82 36
/*
 * hc2cb2_16 -- variant of the generated n = 16 hc2c codelet that is compiled
 * when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined (generator
 * flags, per the command line recorded above: -fma -sign 1 -dif
 * -twiddle-log3 -precompute-twiddles).
 *
 * Parameters, as used by the body:
 *   Rp, Ip  read and then overwritten at index 0 and WS(rs, 1..7); both
 *           pointers advance by +ms after each iteration.
 *   Rm, Im  read and then overwritten at index 0 and WS(rs, 1..7); both
 *           pointers advance by -ms after each iteration.
 *   W       twiddle table; rebased once by (mb - 1) * 8 entries, then
 *           W[0..7] are read per iteration and W advances by 8.
 *   rs      stride passed to WS() to address the slots of each array.
 *   mb, me  iteration bounds: m starts at mb and the loop runs while m < me.
 *   ms      per-iteration pointer step (+ms for Rp/Ip, -ms for Rm/Im).
 *
 * Within one iteration all 32 loads happen before the first store, and the
 * 32 results are written back to the same 32 locations (in place).
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from the included project headers and are not
 * visible here. FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and
 * c - a*b -- confirm against the codelet header.
 */
Chris@82 37 static void hc2cb2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numerically cos(pi/8), sqrt(1/2) and tan(pi/8) = sqrt(2) - 1. */
Chris@82 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 40 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 41 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 42 {
Chris@82 43 INT m;
/* The loop header also steps the five pointers; see the parameter notes above. */
Chris@82 44 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@82 45 E Tv, Tw, T2z, T2C, TB, TF, Ty, Tz, T1V, TA, T2G, T3Q, T3C, T3g, T3L;
Chris@82 46 E T30, T3m, T3z, T3w, T3s, T1X, T1Y, T2u, T2c, T2p, TE, TG, T1G, T1o, T1D;
/*
 * Twiddle phase: read the eight stored values W[0..7] and combine them
 * (products folded through FMA/FNMS) into the additional multipliers that
 * are applied to the outputs further down.
 */
Chris@82 47 {
Chris@82 48 E T3f, T3l, T2F, T3r, T2Z, T3v, TD, Tx;
Chris@82 49 Tv = W[0];
Chris@82 50 Tw = W[2];
Chris@82 51 Tx = Tv * Tw;
Chris@82 52 T2z = W[6];
Chris@82 53 T3f = Tv * T2z;
Chris@82 54 T2C = W[7];
Chris@82 55 T3l = Tv * T2C;
Chris@82 56 TB = W[4];
Chris@82 57 T2F = Tv * TB;
Chris@82 58 T3r = Tw * TB;
Chris@82 59 TF = W[5];
Chris@82 60 T2Z = Tv * TF;
Chris@82 61 T3v = Tw * TF;
Chris@82 62 Ty = W[1];
Chris@82 63 Tz = W[3];
Chris@82 64 TD = Tv * Tz;
Chris@82 65 T1V = FMA(Ty, Tz, Tx);
Chris@82 66 TA = FNMS(Ty, Tz, Tx);
Chris@82 67 T2G = FNMS(Ty, TF, T2F);
Chris@82 68 T3Q = FMA(Tz, TB, T3v);
Chris@82 69 T3C = FNMS(Ty, TB, T2Z);
Chris@82 70 T3g = FMA(Ty, T2C, T3f);
Chris@82 71 T3L = FNMS(Tz, TF, T3r);
Chris@82 72 T30 = FMA(Ty, TB, T2Z);
Chris@82 73 T3m = FNMS(Ty, T2z, T3l);
Chris@82 74 T3z = FMA(Ty, TF, T2F);
Chris@82 75 T3w = FNMS(Tz, TB, T3v);
Chris@82 76 T3s = FMA(Tz, TF, T3r);
Chris@82 77 {
Chris@82 78 E T1W, T2b, TC, T1n;
Chris@82 79 T1W = T1V * TB;
Chris@82 80 T2b = T1V * TF;
Chris@82 81 T1X = FNMS(Ty, Tw, TD);
Chris@82 82 T1Y = FNMS(T1X, TF, T1W);
Chris@82 83 T2u = FNMS(T1X, TB, T2b);
Chris@82 84 T2c = FMA(T1X, TB, T2b);
Chris@82 85 T2p = FMA(T1X, TF, T1W);
Chris@82 86 TC = TA * TB;
Chris@82 87 T1n = TA * TF;
Chris@82 88 TE = FMA(Ty, Tw, TD);
Chris@82 89 TG = FNMS(TE, TF, TC);
Chris@82 90 T1G = FNMS(TE, TB, T1n);
Chris@82 91 T1o = FMA(TE, TB, T1n);
Chris@82 92 T1D = FMA(TE, TF, TC);
Chris@82 93 }
Chris@82 94 }
Chris@82 95 {
Chris@82 96 E TL, T1Z, T2d, T1t, T31, T34, T3n, T3D, T3E, T3R, T1w, T20, Tf, T3M, T2L;
Chris@82 97 E T3h, TW, T2e, T3G, T3H, T3N, T2Q, T36, T2V, T37, Tu, T3S, T18, T1z, T24;
Chris@82 98 E T2g, T27, T2h, T1j, T1y;
/*
 * Load phase, first half: each inner brace group reads one Rp/Rm pair and
 * one Ip/Im pair and keeps the sum and the difference of each pair. This
 * half touches Rp/Ip slots 0, 4, 2, 6 and Rm/Im slots 7, 3, 5, 1.
 */
Chris@82 99 {
Chris@82 100 E T3, TH, T1s, T32, T6, T1p, TK, T33, Ta, TM, TP, T2J, Td, TR, TU;
Chris@82 101 E T2I;
Chris@82 102 {
Chris@82 103 E T1, T2, T1q, T1r;
Chris@82 104 T1 = Rp[0];
Chris@82 105 T2 = Rm[WS(rs, 7)];
Chris@82 106 T3 = T1 + T2;
Chris@82 107 TH = T1 - T2;
Chris@82 108 T1q = Ip[0];
Chris@82 109 T1r = Im[WS(rs, 7)];
Chris@82 110 T1s = T1q + T1r;
Chris@82 111 T32 = T1q - T1r;
Chris@82 112 }
Chris@82 113 {
Chris@82 114 E T4, T5, TI, TJ;
Chris@82 115 T4 = Rp[WS(rs, 4)];
Chris@82 116 T5 = Rm[WS(rs, 3)];
Chris@82 117 T6 = T4 + T5;
Chris@82 118 T1p = T4 - T5;
Chris@82 119 TI = Ip[WS(rs, 4)];
Chris@82 120 TJ = Im[WS(rs, 3)];
Chris@82 121 TK = TI + TJ;
Chris@82 122 T33 = TI - TJ;
Chris@82 123 }
Chris@82 124 {
Chris@82 125 E T8, T9, TN, TO;
Chris@82 126 T8 = Rp[WS(rs, 2)];
Chris@82 127 T9 = Rm[WS(rs, 5)];
Chris@82 128 Ta = T8 + T9;
Chris@82 129 TM = T8 - T9;
Chris@82 130 TN = Ip[WS(rs, 2)];
Chris@82 131 TO = Im[WS(rs, 5)];
Chris@82 132 TP = TN + TO;
Chris@82 133 T2J = TN - TO;
Chris@82 134 }
Chris@82 135 {
Chris@82 136 E Tb, Tc, TS, TT;
Chris@82 137 Tb = Rm[WS(rs, 1)];
Chris@82 138 Tc = Rp[WS(rs, 6)];
Chris@82 139 Td = Tb + Tc;
Chris@82 140 TR = Tb - Tc;
Chris@82 141 TS = Ip[WS(rs, 6)];
Chris@82 142 TT = Im[WS(rs, 1)];
Chris@82 143 TU = TS + TT;
Chris@82 144 T2I = TS - TT;
Chris@82 145 }
Chris@82 146 TL = TH - TK;
Chris@82 147 T1Z = TH + TK;
Chris@82 148 T2d = T1s - T1p;
Chris@82 149 T1t = T1p + T1s;
Chris@82 150 T31 = Ta - Td;
Chris@82 151 T34 = T32 - T33;
Chris@82 152 T3n = T34 - T31;
Chris@82 153 {
Chris@82 154 E T1u, T1v, T7, Te;
Chris@82 155 T3D = T32 + T33;
Chris@82 156 T3E = T2J + T2I;
Chris@82 157 T3R = T3D - T3E;
Chris@82 158 T1u = TM + TP;
Chris@82 159 T1v = TR + TU;
Chris@82 160 T1w = T1u - T1v;
Chris@82 161 T20 = T1u + T1v;
Chris@82 162 T7 = T3 + T6;
Chris@82 163 Te = Ta + Td;
Chris@82 164 Tf = T7 + Te;
Chris@82 165 T3M = T7 - Te;
Chris@82 166 {
Chris@82 167 E T2H, T2K, TQ, TV;
Chris@82 168 T2H = T3 - T6;
Chris@82 169 T2K = T2I - T2J;
Chris@82 170 T2L = T2H + T2K;
Chris@82 171 T3h = T2H - T2K;
Chris@82 172 TQ = TM - TP;
Chris@82 173 TV = TR - TU;
Chris@82 174 TW = TQ + TV;
Chris@82 175 T2e = TQ - TV;
Chris@82 176 }
Chris@82 177 }
Chris@82 178 }
/*
 * Load phase, second half: same pattern for Rp/Ip slots 1, 5, 7, 3 and
 * Rm/Im slots 6, 2, 0, 4, followed by the KP414213562-scaled combinations.
 */
Chris@82 179 {
Chris@82 180 E Ti, T1e, T1c, T2N, Tl, T19, T1h, T2O, Tp, T13, T11, T2S, Ts, TY, T16;
Chris@82 181 E T2T, T2M, T2P;
Chris@82 182 {
Chris@82 183 E Tg, Th, T1a, T1b;
Chris@82 184 Tg = Rp[WS(rs, 1)];
Chris@82 185 Th = Rm[WS(rs, 6)];
Chris@82 186 Ti = Tg + Th;
Chris@82 187 T1e = Tg - Th;
Chris@82 188 T1a = Ip[WS(rs, 1)];
Chris@82 189 T1b = Im[WS(rs, 6)];
Chris@82 190 T1c = T1a + T1b;
Chris@82 191 T2N = T1a - T1b;
Chris@82 192 }
Chris@82 193 {
Chris@82 194 E Tj, Tk, T1f, T1g;
Chris@82 195 Tj = Rp[WS(rs, 5)];
Chris@82 196 Tk = Rm[WS(rs, 2)];
Chris@82 197 Tl = Tj + Tk;
Chris@82 198 T19 = Tj - Tk;
Chris@82 199 T1f = Ip[WS(rs, 5)];
Chris@82 200 T1g = Im[WS(rs, 2)];
Chris@82 201 T1h = T1f + T1g;
Chris@82 202 T2O = T1f - T1g;
Chris@82 203 }
Chris@82 204 {
Chris@82 205 E Tn, To, TZ, T10;
Chris@82 206 Tn = Rm[0];
Chris@82 207 To = Rp[WS(rs, 7)];
Chris@82 208 Tp = Tn + To;
Chris@82 209 T13 = Tn - To;
Chris@82 210 TZ = Ip[WS(rs, 7)];
Chris@82 211 T10 = Im[0];
Chris@82 212 T11 = TZ + T10;
Chris@82 213 T2S = TZ - T10;
Chris@82 214 }
Chris@82 215 {
Chris@82 216 E Tq, Tr, T14, T15;
Chris@82 217 Tq = Rp[WS(rs, 3)];
Chris@82 218 Tr = Rm[WS(rs, 4)];
Chris@82 219 Ts = Tq + Tr;
Chris@82 220 TY = Tq - Tr;
Chris@82 221 T14 = Ip[WS(rs, 3)];
Chris@82 222 T15 = Im[WS(rs, 4)];
Chris@82 223 T16 = T14 + T15;
Chris@82 224 T2T = T14 - T15;
Chris@82 225 }
Chris@82 226 T3G = T2N + T2O;
Chris@82 227 T3H = T2S + T2T;
Chris@82 228 T3N = T3H - T3G;
Chris@82 229 T2M = Ti - Tl;
Chris@82 230 T2P = T2N - T2O;
Chris@82 231 T2Q = T2M - T2P;
Chris@82 232 T36 = T2M + T2P;
Chris@82 233 {
Chris@82 234 E T2R, T2U, Tm, Tt;
Chris@82 235 T2R = Tp - Ts;
Chris@82 236 T2U = T2S - T2T;
Chris@82 237 T2V = T2R + T2U;
Chris@82 238 T37 = T2U - T2R;
Chris@82 239 Tm = Ti + Tl;
Chris@82 240 Tt = Tp + Ts;
Chris@82 241 Tu = Tm + Tt;
Chris@82 242 T3S = Tm - Tt;
Chris@82 243 }
Chris@82 244 {
Chris@82 245 E T12, T17, T22, T23;
Chris@82 246 T12 = TY - T11;
Chris@82 247 T17 = T13 - T16;
Chris@82 248 T18 = FNMS(KP414213562, T17, T12);
Chris@82 249 T1z = FMA(KP414213562, T12, T17);
Chris@82 250 T22 = T1c - T19;
Chris@82 251 T23 = T1e + T1h;
Chris@82 252 T24 = FNMS(KP414213562, T23, T22);
Chris@82 253 T2g = FMA(KP414213562, T22, T23);
Chris@82 254 }
Chris@82 255 {
Chris@82 256 E T25, T26, T1d, T1i;
Chris@82 257 T25 = TY + T11;
Chris@82 258 T26 = T13 + T16;
Chris@82 259 T27 = FNMS(KP414213562, T26, T25);
Chris@82 260 T2h = FMA(KP414213562, T25, T26);
Chris@82 261 T1d = T19 + T1c;
Chris@82 262 T1i = T1e - T1h;
Chris@82 263 T1j = FMA(KP414213562, T1i, T1d);
Chris@82 264 T1y = FNMS(KP414213562, T1d, T1i);
Chris@82 265 }
Chris@82 266 }
/*
 * Store phase: all loads are complete, so results are written back in
 * place. Rp[0] and Rm[0] are stored without a twiddle multiply; every
 * other output is formed by one plain multiply plus one FMA/FNMS using a
 * pair of the multipliers prepared in the twiddle phase.
 */
Chris@82 267 Rp[0] = Tf + Tu;
Chris@82 268 {
Chris@82 269 E T3B, T3K, T3F, T3I, T3J, T3A;
Chris@82 270 T3A = Tf - Tu;
Chris@82 271 T3B = T3z * T3A;
Chris@82 272 T3K = T3C * T3A;
Chris@82 273 T3F = T3D + T3E;
Chris@82 274 T3I = T3G + T3H;
Chris@82 275 T3J = T3F - T3I;
Chris@82 276 Rm[0] = T3F + T3I;
Chris@82 277 Rm[WS(rs, 4)] = FMA(T3z, T3J, T3K);
Chris@82 278 Rp[WS(rs, 4)] = FNMS(T3C, T3J, T3B);
Chris@82 279 }
Chris@82 280 {
Chris@82 281 E T3O, T3P, T3T, T3U;
Chris@82 282 T3O = T3M - T3N;
Chris@82 283 T3P = T3L * T3O;
Chris@82 284 T3T = T3R - T3S;
Chris@82 285 T3U = T3L * T3T;
Chris@82 286 Rp[WS(rs, 6)] = FNMS(T3Q, T3T, T3P);
Chris@82 287 Rm[WS(rs, 6)] = FMA(T3Q, T3O, T3U);
Chris@82 288 }
Chris@82 289 {
Chris@82 290 E T3V, T3W, T3X, T3Y;
Chris@82 291 T3V = T3M + T3N;
Chris@82 292 T3W = TA * T3V;
Chris@82 293 T3X = T3S + T3R;
Chris@82 294 T3Y = TA * T3X;
Chris@82 295 Rp[WS(rs, 2)] = FNMS(TE, T3X, T3W);
Chris@82 296 Rm[WS(rs, 2)] = FMA(TE, T3V, T3Y);
Chris@82 297 }
Chris@82 298 {
Chris@82 299 E T3j, T3t, T3p, T3x, T3i, T3o;
Chris@82 300 T3i = T37 - T36;
Chris@82 301 T3j = FNMS(KP707106781, T3i, T3h);
Chris@82 302 T3t = FMA(KP707106781, T3i, T3h);
Chris@82 303 T3o = T2Q - T2V;
Chris@82 304 T3p = FNMS(KP707106781, T3o, T3n);
Chris@82 305 T3x = FMA(KP707106781, T3o, T3n);
Chris@82 306 {
Chris@82 307 E T3k, T3q, T3u, T3y;
Chris@82 308 T3k = T3g * T3j;
Chris@82 309 Rp[WS(rs, 7)] = FNMS(T3m, T3p, T3k);
Chris@82 310 T3q = T3g * T3p;
Chris@82 311 Rm[WS(rs, 7)] = FMA(T3m, T3j, T3q);
Chris@82 312 T3u = T3s * T3t;
Chris@82 313 Rp[WS(rs, 3)] = FNMS(T3w, T3x, T3u);
Chris@82 314 T3y = T3s * T3x;
Chris@82 315 Rm[WS(rs, 3)] = FMA(T3w, T3t, T3y);
Chris@82 316 }
Chris@82 317 }
Chris@82 318 {
Chris@82 319 E T2X, T3b, T39, T3d, T2W, T35, T38;
Chris@82 320 T2W = T2Q + T2V;
Chris@82 321 T2X = FNMS(KP707106781, T2W, T2L);
Chris@82 322 T3b = FMA(KP707106781, T2W, T2L);
Chris@82 323 T35 = T31 + T34;
Chris@82 324 T38 = T36 + T37;
Chris@82 325 T39 = FNMS(KP707106781, T38, T35);
Chris@82 326 T3d = FMA(KP707106781, T38, T35);
Chris@82 327 {
Chris@82 328 E T2Y, T3a, T3c, T3e;
Chris@82 329 T2Y = T2G * T2X;
Chris@82 330 Rp[WS(rs, 5)] = FNMS(T30, T39, T2Y);
Chris@82 331 T3a = T30 * T2X;
Chris@82 332 Rm[WS(rs, 5)] = FMA(T2G, T39, T3a);
Chris@82 333 T3c = T1V * T3b;
Chris@82 334 Rp[WS(rs, 1)] = FNMS(T1X, T3d, T3c);
Chris@82 335 T3e = T1X * T3b;
Chris@82 336 Rm[WS(rs, 1)] = FMA(T1V, T3d, T3e);
Chris@82 337 }
Chris@82 338 }
/* The remaining groups produce the Ip/Im outputs, two slots per group. */
Chris@82 339 {
Chris@82 340 E T29, T2l, T2j, T2n;
Chris@82 341 {
Chris@82 342 E T21, T28, T2f, T2i;
Chris@82 343 T21 = FNMS(KP707106781, T20, T1Z);
Chris@82 344 T28 = T24 + T27;
Chris@82 345 T29 = FMA(KP923879532, T28, T21);
Chris@82 346 T2l = FNMS(KP923879532, T28, T21);
Chris@82 347 T2f = FMA(KP707106781, T2e, T2d);
Chris@82 348 T2i = T2g - T2h;
Chris@82 349 T2j = FNMS(KP923879532, T2i, T2f);
Chris@82 350 T2n = FMA(KP923879532, T2i, T2f);
Chris@82 351 }
Chris@82 352 {
Chris@82 353 E T2a, T2k, T2m, T2o;
Chris@82 354 T2a = T1Y * T29;
Chris@82 355 Ip[WS(rs, 5)] = FNMS(T2c, T2j, T2a);
Chris@82 356 T2k = T2c * T29;
Chris@82 357 Im[WS(rs, 5)] = FMA(T1Y, T2j, T2k);
Chris@82 358 T2m = Tw * T2l;
Chris@82 359 Ip[WS(rs, 1)] = FNMS(Tz, T2n, T2m);
Chris@82 360 T2o = Tz * T2l;
Chris@82 361 Im[WS(rs, 1)] = FMA(Tw, T2n, T2o);
Chris@82 362 }
Chris@82 363 }
Chris@82 364 {
Chris@82 365 E T1l, T1E, T1B, T1H;
Chris@82 366 {
Chris@82 367 E TX, T1k, T1x, T1A;
Chris@82 368 TX = FNMS(KP707106781, TW, TL);
Chris@82 369 T1k = T18 - T1j;
Chris@82 370 T1l = FNMS(KP923879532, T1k, TX);
Chris@82 371 T1E = FMA(KP923879532, T1k, TX);
Chris@82 372 T1x = FNMS(KP707106781, T1w, T1t);
Chris@82 373 T1A = T1y - T1z;
Chris@82 374 T1B = FNMS(KP923879532, T1A, T1x);
Chris@82 375 T1H = FMA(KP923879532, T1A, T1x);
Chris@82 376 }
Chris@82 377 {
Chris@82 378 E T1m, T1C, T1F, T1I;
Chris@82 379 T1m = TG * T1l;
Chris@82 380 Ip[WS(rs, 6)] = FNMS(T1o, T1B, T1m);
Chris@82 381 T1C = T1o * T1l;
Chris@82 382 Im[WS(rs, 6)] = FMA(TG, T1B, T1C);
Chris@82 383 T1F = T1D * T1E;
Chris@82 384 Ip[WS(rs, 2)] = FNMS(T1G, T1H, T1F);
Chris@82 385 T1I = T1G * T1E;
Chris@82 386 Im[WS(rs, 2)] = FMA(T1D, T1H, T1I);
Chris@82 387 }
Chris@82 388 }
Chris@82 389 {
Chris@82 390 E T2s, T2A, T2x, T2D;
Chris@82 391 {
Chris@82 392 E T2q, T2r, T2v, T2w;
Chris@82 393 T2q = FMA(KP707106781, T20, T1Z);
Chris@82 394 T2r = T2g + T2h;
Chris@82 395 T2s = FNMS(KP923879532, T2r, T2q);
Chris@82 396 T2A = FMA(KP923879532, T2r, T2q);
Chris@82 397 T2v = FNMS(KP707106781, T2e, T2d);
Chris@82 398 T2w = T27 - T24;
Chris@82 399 T2x = FMA(KP923879532, T2w, T2v);
Chris@82 400 T2D = FNMS(KP923879532, T2w, T2v);
Chris@82 401 }
Chris@82 402 {
Chris@82 403 E T2t, T2y, T2B, T2E;
Chris@82 404 T2t = T2p * T2s;
Chris@82 405 Ip[WS(rs, 3)] = FNMS(T2u, T2x, T2t);
Chris@82 406 T2y = T2p * T2x;
Chris@82 407 Im[WS(rs, 3)] = FMA(T2u, T2s, T2y);
Chris@82 408 T2B = T2z * T2A;
Chris@82 409 Ip[WS(rs, 7)] = FNMS(T2C, T2D, T2B);
Chris@82 410 T2E = T2z * T2D;
Chris@82 411 Im[WS(rs, 7)] = FMA(T2C, T2A, T2E);
Chris@82 412 }
Chris@82 413 }
Chris@82 414 {
Chris@82 415 E T1L, T1R, T1P, T1T;
Chris@82 416 {
Chris@82 417 E T1J, T1K, T1N, T1O;
Chris@82 418 T1J = FMA(KP707106781, TW, TL);
Chris@82 419 T1K = T1y + T1z;
Chris@82 420 T1L = FNMS(KP923879532, T1K, T1J);
Chris@82 421 T1R = FMA(KP923879532, T1K, T1J);
Chris@82 422 T1N = FMA(KP707106781, T1w, T1t);
Chris@82 423 T1O = T1j + T18;
Chris@82 424 T1P = FNMS(KP923879532, T1O, T1N);
Chris@82 425 T1T = FMA(KP923879532, T1O, T1N);
Chris@82 426 }
Chris@82 427 {
Chris@82 428 E T1M, T1Q, T1S, T1U;
Chris@82 429 T1M = TB * T1L;
Chris@82 430 Ip[WS(rs, 4)] = FNMS(TF, T1P, T1M);
Chris@82 431 T1Q = TB * T1P;
Chris@82 432 Im[WS(rs, 4)] = FMA(TF, T1L, T1Q);
Chris@82 433 T1S = Tv * T1R;
Chris@82 434 Ip[0] = FNMS(Ty, T1T, T1S);
Chris@82 435 T1U = Tv * T1T;
Chris@82 436 Im[0] = FMA(Ty, T1R, T1U);
Chris@82 437 }
Chris@82 438 }
Chris@82 439 }
Chris@82 440 }
Chris@82 441 }
Chris@82 442 }
Chris@82 443
/*
 * Twiddle instruction table referenced by desc: four TW_CEXP entries whose
 * last field is 1, 3, 9 and 15, closed by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry supplies one pair of the
 * eight W values the codelet reads per iteration -- confirm against the
 * tw_instr definition in the project headers.
 */
Chris@82 444 static const tw_instr twinstr[] = {
Chris@82 445 {TW_CEXP, 1, 1},
Chris@82 446 {TW_CEXP, 1, 3},
Chris@82 447 {TW_CEXP, 1, 9},
Chris@82 448 {TW_CEXP, 1, 15},
Chris@82 449 {TW_NEXT, 1, 0}
Chris@82 450 };
Chris@82 451
/*
 * Codelet descriptor: size 16, the codelet name, its twiddle table, &GENUS,
 * and the counts {104, 42, 92, 0}. The first three match the additions,
 * multiplications and fused multiply/adds quoted in the generated header
 * comment of this branch; the meaning of the trailing 0 is not visible here.
 */
Chris@82 452 static const hc2c_desc desc = { 16, "hc2cb2_16", twinstr, &GENUS, {104, 42, 92, 0} };
Chris@82 453
/*
 * Registration entry point: hands the hc2cb2_16 function and its descriptor
 * to X(khc2c_register) for planner p, tagged HC2C_VIA_RDFT.
 * NOTE(review): X() is a project macro wrapping the identifier -- its
 * expansion is not visible in this file.
 */
Chris@82 454 void X(codelet_hc2cb2_16) (planner *p) {
Chris@82 455 X(khc2c_register) (p, hc2cb2_16, &desc, HC2C_VIA_RDFT);
Chris@82 456 }
Chris@82 457 #else
Chris@82 458
Chris@82 459 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hc2cb2_16 -include rdft/scalar/hc2cb.h */
Chris@82 460
Chris@82 461 /*
Chris@82 462 * This function contains 196 FP additions, 108 FP multiplications,
Chris@82 463 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
Chris@82 464 * 80 stack variables, 3 constants, and 64 memory accesses
Chris@82 465 */
Chris@82 466 #include "rdft/scalar/hc2cb.h"
Chris@82 467
/*
 * hc2cb2_16 -- variant of the generated n = 16 hc2c codelet that is compiled
 * when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined
 * (same generator flags as the other branch, minus -fma). It uses mostly
 * separate multiplies and adds, with FMA/FNMS only where a product is
 * combined with another product.
 *
 * Parameters, as used by the body:
 *   Rp, Ip  read and then overwritten at index 0 and WS(rs, 1..7); both
 *           pointers advance by +ms after each iteration.
 *   Rm, Im  read and then overwritten at index 0 and WS(rs, 1..7); both
 *           pointers advance by -ms after each iteration.
 *   W       twiddle table; rebased once by (mb - 1) * 8 entries, then
 *           W[0..7] are read per iteration and W advances by 8.
 *   rs      stride passed to WS() to address the slots of each array.
 *   mb, me  iteration bounds: m starts at mb and the loop runs while m < me.
 *   ms      per-iteration pointer step (+ms for Rp/Ip, -ms for Rm/Im).
 *
 * Within one iteration all 32 loads happen before the first store, and the
 * 32 results are written back to the same 32 locations (in place).
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from the included project headers and are not
 * visible here. FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and
 * c - a*b -- confirm against the codelet header.
 */
Chris@82 468 static void hc2cb2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 469 {
/* Numerically sin(pi/8), cos(pi/8) and sqrt(1/2). */
Chris@82 470 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 471 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 472 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 473 {
Chris@82 474 INT m;
/* The loop header also steps the five pointers; see the parameter notes above. */
Chris@82 475 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@82 476 E Tv, Ty, T1l, T1n, T1p, T1t, T27, T25, Tz, Tw, TB, T21, T1P, T1H, T1X;
Chris@82 477 E T17, T1L, T1N, T1v, T1w, T1x, T1B, T2F, T2T, T2b, T2R, T3j, T3x, T35, T3t;
/*
 * Twiddle phase: read the eight stored values W[0..7] and combine them by
 * sums and differences of pairwise products into the additional multipliers
 * that are applied to the outputs further down.
 */
Chris@82 478 {
Chris@82 479 E TA, T1J, T15, T1G, Tx, T1K, T16, T1F;
Chris@82 480 {
Chris@82 481 E T1m, T1s, T1o, T1r;
Chris@82 482 Tv = W[0];
Chris@82 483 Ty = W[1];
Chris@82 484 T1l = W[2];
Chris@82 485 T1n = W[3];
Chris@82 486 T1m = Tv * T1l;
Chris@82 487 T1s = Ty * T1l;
Chris@82 488 T1o = Ty * T1n;
Chris@82 489 T1r = Tv * T1n;
Chris@82 490 T1p = T1m + T1o;
Chris@82 491 T1t = T1r - T1s;
Chris@82 492 T27 = T1r + T1s;
Chris@82 493 T25 = T1m - T1o;
Chris@82 494 Tz = W[5];
Chris@82 495 TA = Ty * Tz;
Chris@82 496 T1J = T1l * Tz;
Chris@82 497 T15 = Tv * Tz;
Chris@82 498 T1G = T1n * Tz;
Chris@82 499 Tw = W[4];
Chris@82 500 Tx = Tv * Tw;
Chris@82 501 T1K = T1n * Tw;
Chris@82 502 T16 = Ty * Tw;
Chris@82 503 T1F = T1l * Tw;
Chris@82 504 }
Chris@82 505 TB = Tx - TA;
Chris@82 506 T21 = T1J + T1K;
Chris@82 507 T1P = T15 - T16;
Chris@82 508 T1H = T1F + T1G;
Chris@82 509 T1X = T1F - T1G;
Chris@82 510 T17 = T15 + T16;
Chris@82 511 T1L = T1J - T1K;
Chris@82 512 T1N = Tx + TA;
Chris@82 513 T1v = W[6];
Chris@82 514 T1w = W[7];
Chris@82 515 T1x = FMA(Tv, T1v, Ty * T1w);
Chris@82 516 T1B = FNMS(Ty, T1v, Tv * T1w);
Chris@82 517 {
Chris@82 518 E T2D, T2E, T29, T2a;
Chris@82 519 T2D = T25 * Tz;
Chris@82 520 T2E = T27 * Tw;
Chris@82 521 T2F = T2D + T2E;
Chris@82 522 T2T = T2D - T2E;
Chris@82 523 T29 = T25 * Tw;
Chris@82 524 T2a = T27 * Tz;
Chris@82 525 T2b = T29 - T2a;
Chris@82 526 T2R = T29 + T2a;
Chris@82 527 }
Chris@82 528 {
Chris@82 529 E T3h, T3i, T33, T34;
Chris@82 530 T3h = T1p * Tz;
Chris@82 531 T3i = T1t * Tw;
Chris@82 532 T3j = T3h + T3i;
Chris@82 533 T3x = T3h - T3i;
Chris@82 534 T33 = T1p * Tw;
Chris@82 535 T34 = T1t * Tz;
Chris@82 536 T35 = T33 - T34;
Chris@82 537 T3t = T33 + T34;
Chris@82 538 }
Chris@82 539 }
Chris@82 540 {
Chris@82 541 E T7, T36, T3k, TC, T1f, T2e, T2I, T1Q, Te, TJ, T1R, T18, T2L, T37, T2l;
Chris@82 542 E T3l, Tm, T1T, TT, T1h, T2A, T2N, T3b, T3n, Tt, T1U, T12, T1i, T2t, T2O;
Chris@82 543 E T3e, T3o;
/*
 * Load phase: each innermost brace group reads one Rp/Rm pair and one
 * Ip/Im pair and keeps the sum and the difference of each pair. The four
 * outer groups cover Rp/Ip slots {0, 4}, {2, 6}, {1, 5} and {7, 3}.
 */
Chris@82 544 {
Chris@82 545 E T3, T2c, T1b, T2H, T6, T2G, T1e, T2d;
Chris@82 546 {
Chris@82 547 E T1, T2, T19, T1a;
Chris@82 548 T1 = Rp[0];
Chris@82 549 T2 = Rm[WS(rs, 7)];
Chris@82 550 T3 = T1 + T2;
Chris@82 551 T2c = T1 - T2;
Chris@82 552 T19 = Ip[0];
Chris@82 553 T1a = Im[WS(rs, 7)];
Chris@82 554 T1b = T19 - T1a;
Chris@82 555 T2H = T19 + T1a;
Chris@82 556 }
Chris@82 557 {
Chris@82 558 E T4, T5, T1c, T1d;
Chris@82 559 T4 = Rp[WS(rs, 4)];
Chris@82 560 T5 = Rm[WS(rs, 3)];
Chris@82 561 T6 = T4 + T5;
Chris@82 562 T2G = T4 - T5;
Chris@82 563 T1c = Ip[WS(rs, 4)];
Chris@82 564 T1d = Im[WS(rs, 3)];
Chris@82 565 T1e = T1c - T1d;
Chris@82 566 T2d = T1c + T1d;
Chris@82 567 }
Chris@82 568 T7 = T3 + T6;
Chris@82 569 T36 = T2c + T2d;
Chris@82 570 T3k = T2H - T2G;
Chris@82 571 TC = T3 - T6;
Chris@82 572 T1f = T1b - T1e;
Chris@82 573 T2e = T2c - T2d;
Chris@82 574 T2I = T2G + T2H;
Chris@82 575 T1Q = T1b + T1e;
Chris@82 576 }
Chris@82 577 {
Chris@82 578 E Ta, T2f, TI, T2g, Td, T2i, TF, T2j;
Chris@82 579 {
Chris@82 580 E T8, T9, TG, TH;
Chris@82 581 T8 = Rp[WS(rs, 2)];
Chris@82 582 T9 = Rm[WS(rs, 5)];
Chris@82 583 Ta = T8 + T9;
Chris@82 584 T2f = T8 - T9;
Chris@82 585 TG = Ip[WS(rs, 2)];
Chris@82 586 TH = Im[WS(rs, 5)];
Chris@82 587 TI = TG - TH;
Chris@82 588 T2g = TG + TH;
Chris@82 589 }
Chris@82 590 {
Chris@82 591 E Tb, Tc, TD, TE;
Chris@82 592 Tb = Rm[WS(rs, 1)];
Chris@82 593 Tc = Rp[WS(rs, 6)];
Chris@82 594 Td = Tb + Tc;
Chris@82 595 T2i = Tb - Tc;
Chris@82 596 TD = Ip[WS(rs, 6)];
Chris@82 597 TE = Im[WS(rs, 1)];
Chris@82 598 TF = TD - TE;
Chris@82 599 T2j = TD + TE;
Chris@82 600 }
Chris@82 601 Te = Ta + Td;
Chris@82 602 TJ = TF - TI;
Chris@82 603 T1R = TI + TF;
Chris@82 604 T18 = Ta - Td;
Chris@82 605 {
Chris@82 606 E T2J, T2K, T2h, T2k;
Chris@82 607 T2J = T2f + T2g;
Chris@82 608 T2K = T2i + T2j;
Chris@82 609 T2L = KP707106781 * (T2J - T2K);
Chris@82 610 T37 = KP707106781 * (T2J + T2K);
Chris@82 611 T2h = T2f - T2g;
Chris@82 612 T2k = T2i - T2j;
Chris@82 613 T2l = KP707106781 * (T2h + T2k);
Chris@82 614 T3l = KP707106781 * (T2h - T2k);
Chris@82 615 }
Chris@82 616 }
Chris@82 617 {
Chris@82 618 E Ti, T2x, TO, T2v, Tl, T2u, TR, T2y, TL, TS;
Chris@82 619 {
Chris@82 620 E Tg, Th, TM, TN;
Chris@82 621 Tg = Rp[WS(rs, 1)];
Chris@82 622 Th = Rm[WS(rs, 6)];
Chris@82 623 Ti = Tg + Th;
Chris@82 624 T2x = Tg - Th;
Chris@82 625 TM = Ip[WS(rs, 1)];
Chris@82 626 TN = Im[WS(rs, 6)];
Chris@82 627 TO = TM - TN;
Chris@82 628 T2v = TM + TN;
Chris@82 629 }
Chris@82 630 {
Chris@82 631 E Tj, Tk, TP, TQ;
Chris@82 632 Tj = Rp[WS(rs, 5)];
Chris@82 633 Tk = Rm[WS(rs, 2)];
Chris@82 634 Tl = Tj + Tk;
Chris@82 635 T2u = Tj - Tk;
Chris@82 636 TP = Ip[WS(rs, 5)];
Chris@82 637 TQ = Im[WS(rs, 2)];
Chris@82 638 TR = TP - TQ;
Chris@82 639 T2y = TP + TQ;
Chris@82 640 }
Chris@82 641 Tm = Ti + Tl;
Chris@82 642 T1T = TO + TR;
Chris@82 643 TL = Ti - Tl;
Chris@82 644 TS = TO - TR;
Chris@82 645 TT = TL - TS;
Chris@82 646 T1h = TL + TS;
Chris@82 647 {
Chris@82 648 E T2w, T2z, T39, T3a;
Chris@82 649 T2w = T2u + T2v;
Chris@82 650 T2z = T2x - T2y;
Chris@82 651 T2A = FMA(KP923879532, T2w, KP382683432 * T2z);
Chris@82 652 T2N = FNMS(KP382683432, T2w, KP923879532 * T2z);
Chris@82 653 T39 = T2x + T2y;
Chris@82 654 T3a = T2v - T2u;
Chris@82 655 T3b = FNMS(KP923879532, T3a, KP382683432 * T39);
Chris@82 656 T3n = FMA(KP382683432, T3a, KP923879532 * T39);
Chris@82 657 }
Chris@82 658 }
Chris@82 659 {
Chris@82 660 E Tp, T2q, TX, T2o, Ts, T2n, T10, T2r, TU, T11;
Chris@82 661 {
Chris@82 662 E Tn, To, TV, TW;
Chris@82 663 Tn = Rm[0];
Chris@82 664 To = Rp[WS(rs, 7)];
Chris@82 665 Tp = Tn + To;
Chris@82 666 T2q = Tn - To;
Chris@82 667 TV = Ip[WS(rs, 7)];
Chris@82 668 TW = Im[0];
Chris@82 669 TX = TV - TW;
Chris@82 670 T2o = TV + TW;
Chris@82 671 }
Chris@82 672 {
Chris@82 673 E Tq, Tr, TY, TZ;
Chris@82 674 Tq = Rp[WS(rs, 3)];
Chris@82 675 Tr = Rm[WS(rs, 4)];
Chris@82 676 Ts = Tq + Tr;
Chris@82 677 T2n = Tq - Tr;
Chris@82 678 TY = Ip[WS(rs, 3)];
Chris@82 679 TZ = Im[WS(rs, 4)];
Chris@82 680 T10 = TY - TZ;
Chris@82 681 T2r = TY + TZ;
Chris@82 682 }
Chris@82 683 Tt = Tp + Ts;
Chris@82 684 T1U = TX + T10;
Chris@82 685 TU = Tp - Ts;
Chris@82 686 T11 = TX - T10;
Chris@82 687 T12 = TU + T11;
Chris@82 688 T1i = T11 - TU;
Chris@82 689 {
Chris@82 690 E T2p, T2s, T3c, T3d;
Chris@82 691 T2p = T2n - T2o;
Chris@82 692 T2s = T2q - T2r;
Chris@82 693 T2t = FNMS(KP382683432, T2s, KP923879532 * T2p);
Chris@82 694 T2O = FMA(KP382683432, T2p, KP923879532 * T2s);
Chris@82 695 T3c = T2q + T2r;
Chris@82 696 T3d = T2n + T2o;
Chris@82 697 T3e = FNMS(KP923879532, T3d, KP382683432 * T3c);
Chris@82 698 T3o = FMA(KP382683432, T3d, KP923879532 * T3c);
Chris@82 699 }
Chris@82 700 }
/*
 * Store phase: all loads are complete, so results are written back in
 * place. Rp[0] and Rm[0] are stored without a twiddle multiply; every
 * other output combines two intermediate values with a pair of the
 * multipliers prepared in the twiddle phase.
 */
Chris@82 701 {
Chris@82 702 E Tf, Tu, T1O, T1S, T1V, T1W;
Chris@82 703 Tf = T7 + Te;
Chris@82 704 Tu = Tm + Tt;
Chris@82 705 T1O = Tf - Tu;
Chris@82 706 T1S = T1Q + T1R;
Chris@82 707 T1V = T1T + T1U;
Chris@82 708 T1W = T1S - T1V;
Chris@82 709 Rp[0] = Tf + Tu;
Chris@82 710 Rm[0] = T1S + T1V;
Chris@82 711 Rp[WS(rs, 4)] = FNMS(T1P, T1W, T1N * T1O);
Chris@82 712 Rm[WS(rs, 4)] = FMA(T1P, T1O, T1N * T1W);
Chris@82 713 }
Chris@82 714 {
Chris@82 715 E T3g, T3r, T3q, T3s;
Chris@82 716 {
Chris@82 717 E T38, T3f, T3m, T3p;
Chris@82 718 T38 = T36 - T37;
Chris@82 719 T3f = T3b + T3e;
Chris@82 720 T3g = T38 - T3f;
Chris@82 721 T3r = T38 + T3f;
Chris@82 722 T3m = T3k + T3l;
Chris@82 723 T3p = T3n - T3o;
Chris@82 724 T3q = T3m - T3p;
Chris@82 725 T3s = T3m + T3p;
Chris@82 726 }
Chris@82 727 Ip[WS(rs, 5)] = FNMS(T3j, T3q, T35 * T3g);
Chris@82 728 Im[WS(rs, 5)] = FMA(T3j, T3g, T35 * T3q);
Chris@82 729 Ip[WS(rs, 1)] = FNMS(T1n, T3s, T1l * T3r);
Chris@82 730 Im[WS(rs, 1)] = FMA(T1n, T3r, T1l * T3s);
Chris@82 731 }
Chris@82 732 {
Chris@82 733 E T3w, T3B, T3A, T3C;
Chris@82 734 {
Chris@82 735 E T3u, T3v, T3y, T3z;
Chris@82 736 T3u = T36 + T37;
Chris@82 737 T3v = T3n + T3o;
Chris@82 738 T3w = T3u - T3v;
Chris@82 739 T3B = T3u + T3v;
Chris@82 740 T3y = T3k - T3l;
Chris@82 741 T3z = T3b - T3e;
Chris@82 742 T3A = T3y + T3z;
Chris@82 743 T3C = T3y - T3z;
Chris@82 744 }
Chris@82 745 Ip[WS(rs, 3)] = FNMS(T3x, T3A, T3t * T3w);
Chris@82 746 Im[WS(rs, 3)] = FMA(T3t, T3A, T3x * T3w);
Chris@82 747 Ip[WS(rs, 7)] = FNMS(T1w, T3C, T1v * T3B);
Chris@82 748 Im[WS(rs, 7)] = FMA(T1v, T3C, T1w * T3B);
Chris@82 749 }
Chris@82 750 {
Chris@82 751 E T14, T1q, T1k, T1u;
Chris@82 752 {
Chris@82 753 E TK, T13, T1g, T1j;
Chris@82 754 TK = TC + TJ;
Chris@82 755 T13 = KP707106781 * (TT + T12);
Chris@82 756 T14 = TK - T13;
Chris@82 757 T1q = TK + T13;
Chris@82 758 T1g = T18 + T1f;
Chris@82 759 T1j = KP707106781 * (T1h + T1i);
Chris@82 760 T1k = T1g - T1j;
Chris@82 761 T1u = T1g + T1j;
Chris@82 762 }
Chris@82 763 Rp[WS(rs, 5)] = FNMS(T17, T1k, TB * T14);
Chris@82 764 Rm[WS(rs, 5)] = FMA(T17, T14, TB * T1k);
Chris@82 765 Rp[WS(rs, 1)] = FNMS(T1t, T1u, T1p * T1q);
Chris@82 766 Rm[WS(rs, 1)] = FMA(T1t, T1q, T1p * T1u);
Chris@82 767 }
Chris@82 768 {
Chris@82 769 E T1A, T1I, T1E, T1M;
Chris@82 770 {
Chris@82 771 E T1y, T1z, T1C, T1D;
Chris@82 772 T1y = TC - TJ;
Chris@82 773 T1z = KP707106781 * (T1i - T1h);
Chris@82 774 T1A = T1y - T1z;
Chris@82 775 T1I = T1y + T1z;
Chris@82 776 T1C = T1f - T18;
Chris@82 777 T1D = KP707106781 * (TT - T12);
Chris@82 778 T1E = T1C - T1D;
Chris@82 779 T1M = T1C + T1D;
Chris@82 780 }
Chris@82 781 Rp[WS(rs, 7)] = FNMS(T1B, T1E, T1x * T1A);
Chris@82 782 Rm[WS(rs, 7)] = FMA(T1x, T1E, T1B * T1A);
Chris@82 783 Rp[WS(rs, 3)] = FNMS(T1L, T1M, T1H * T1I);
Chris@82 784 Rm[WS(rs, 3)] = FMA(T1H, T1M, T1L * T1I);
Chris@82 785 }
Chris@82 786 {
Chris@82 787 E T2C, T2S, T2Q, T2U;
Chris@82 788 {
Chris@82 789 E T2m, T2B, T2M, T2P;
Chris@82 790 T2m = T2e - T2l;
Chris@82 791 T2B = T2t - T2A;
Chris@82 792 T2C = T2m - T2B;
Chris@82 793 T2S = T2m + T2B;
Chris@82 794 T2M = T2I - T2L;
Chris@82 795 T2P = T2N - T2O;
Chris@82 796 T2Q = T2M - T2P;
Chris@82 797 T2U = T2M + T2P;
Chris@82 798 }
Chris@82 799 Ip[WS(rs, 6)] = FNMS(T2F, T2Q, T2b * T2C);
Chris@82 800 Im[WS(rs, 6)] = FMA(T2F, T2C, T2b * T2Q);
Chris@82 801 Ip[WS(rs, 2)] = FNMS(T2T, T2U, T2R * T2S);
Chris@82 802 Im[WS(rs, 2)] = FMA(T2T, T2S, T2R * T2U);
Chris@82 803 }
Chris@82 804 {
Chris@82 805 E T2X, T31, T30, T32;
Chris@82 806 {
Chris@82 807 E T2V, T2W, T2Y, T2Z;
Chris@82 808 T2V = T2e + T2l;
Chris@82 809 T2W = T2N + T2O;
Chris@82 810 T2X = T2V - T2W;
Chris@82 811 T31 = T2V + T2W;
Chris@82 812 T2Y = T2I + T2L;
Chris@82 813 T2Z = T2A + T2t;
Chris@82 814 T30 = T2Y - T2Z;
Chris@82 815 T32 = T2Y + T2Z;
Chris@82 816 }
Chris@82 817 Ip[WS(rs, 4)] = FNMS(Tz, T30, Tw * T2X);
Chris@82 818 Im[WS(rs, 4)] = FMA(Tw, T30, Tz * T2X);
Chris@82 819 Ip[0] = FNMS(Ty, T32, Tv * T31);
Chris@82 820 Im[0] = FMA(Tv, T32, Ty * T31);
Chris@82 821 }
Chris@82 822 {
Chris@82 823 E T20, T26, T24, T28;
Chris@82 824 {
Chris@82 825 E T1Y, T1Z, T22, T23;
Chris@82 826 T1Y = T7 - Te;
Chris@82 827 T1Z = T1U - T1T;
Chris@82 828 T20 = T1Y - T1Z;
Chris@82 829 T26 = T1Y + T1Z;
Chris@82 830 T22 = T1Q - T1R;
Chris@82 831 T23 = Tm - Tt;
Chris@82 832 T24 = T22 - T23;
Chris@82 833 T28 = T23 + T22;
Chris@82 834 }
Chris@82 835 Rp[WS(rs, 6)] = FNMS(T21, T24, T1X * T20);
Chris@82 836 Rm[WS(rs, 6)] = FMA(T1X, T24, T21 * T20);
Chris@82 837 Rp[WS(rs, 2)] = FNMS(T27, T28, T25 * T26);
Chris@82 838 Rm[WS(rs, 2)] = FMA(T25, T28, T27 * T26);
Chris@82 839 }
Chris@82 840 }
Chris@82 841 }
Chris@82 842 }
Chris@82 843 }
Chris@82 844
/*
 * Twiddle instruction table referenced by desc: four TW_CEXP entries whose
 * last field is 1, 3, 9 and 15, closed by a TW_NEXT entry. The contents are
 * the same as the table in the FMA branch of this file.
 * NOTE(review): presumably each TW_CEXP entry supplies one pair of the
 * eight W values the codelet reads per iteration -- confirm against the
 * tw_instr definition in the project headers.
 */
Chris@82 845 static const tw_instr twinstr[] = {
Chris@82 846 {TW_CEXP, 1, 1},
Chris@82 847 {TW_CEXP, 1, 3},
Chris@82 848 {TW_CEXP, 1, 9},
Chris@82 849 {TW_CEXP, 1, 15},
Chris@82 850 {TW_NEXT, 1, 0}
Chris@82 851 };
Chris@82 852
/*
 * Codelet descriptor: size 16, the codelet name, its twiddle table, &GENUS,
 * and the counts {156, 68, 40, 0}. The first three match the additions,
 * multiplications and fused multiply/adds quoted in the generated header
 * comment of this branch; the meaning of the trailing 0 is not visible here.
 */
Chris@82 853 static const hc2c_desc desc = { 16, "hc2cb2_16", twinstr, &GENUS, {156, 68, 40, 0} };
Chris@82 854
/*
 * Registration entry point: hands the hc2cb2_16 function and its descriptor
 * to X(khc2c_register) for planner p, tagged HC2C_VIA_RDFT.
 * NOTE(review): X() is a project macro wrapping the identifier -- its
 * expansion is not visible in this file.
 */
Chris@82 855 void X(codelet_hc2cb2_16) (planner *p) {
Chris@82 856 X(khc2c_register) (p, hc2cb2_16, &desc, HC2C_VIA_RDFT);
Chris@82 857 }
Chris@82 858 #endif