annotate src/fftw-3.3.8/rdft/scalar/r2cf/hc2cf2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:08 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cf2_16 -include rdft/scalar/hc2cf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 196 FP additions, 134 FP multiplications,
Chris@82 32 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
Chris@82 33 * 90 stack variables, 3 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cf.h"
Chris@82 36
/*
 * hc2cf2_16 (FMA build): machine-generated size-16 kernel (generator options
 * "-n 16 -dit -twiddle-log3 -precompute-twiddles -fma"), compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Rp, Ip, Rm, Im: four data arrays that are read and then overwritten in
 *     place. Every iteration touches elements WS(rs, 0) .. WS(rs, 7) of each
 *     array (element 0 is addressed directly as [0]).
 * W:  twiddle table. Eight values, W[0] .. W[7], are read per iteration,
 *     starting at offset (mb - 1) * 8 from the pointer passed in.
 * rs: stride handed to WS() to address the eight elements of each array.
 * mb, me: the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms: per-iteration pointer step; Rp and Ip advance by ms while Rm and Im
 *     move back by ms.
 *
 * E, R, INT, stride, DK, WS, FMA, FNMS, FMS and MAKE_VOLATILE_STRIDE are
 * supplied by the included project headers and are not defined in this file.
 * NOTE(review): FMA(a, b, c), FNMS(a, b, c) and FMS(a, b, c) are presumably
 * a*b + c, c - a*b and a*b - c respectively -- confirm in the FFTW headers.
 */
Chris@82 37 static void hc2cf2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626); /* numerically cos(pi/8) */
Chris@82 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875); /* numerically tan(pi/8) = sqrt(2) - 1 */
Chris@82 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically sqrt(1/2) */
Chris@82 42 {
Chris@82 43 INT m;
Chris@82 44 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@82 45 E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
Chris@82 46 E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
Chris@82 47 { /* load W[0..7] and derive every other multiplier used below from their products */
Chris@82 48 E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
Chris@82 49 T2 = W[0];
Chris@82 50 Tf = W[2];
Chris@82 51 Tg = T2 * Tf;
Chris@82 52 TM = W[6];
Chris@82 53 TN = T2 * TM;
Chris@82 54 TO = W[7];
Chris@82 55 TS = T2 * TO;
Chris@82 56 T3 = W[4];
Chris@82 57 T4 = T2 * T3;
Chris@82 58 Tp = Tf * T3;
Chris@82 59 T6 = W[5];
Chris@82 60 Ta = T2 * T6;
Chris@82 61 Tt = Tf * T6;
Chris@82 62 T5 = W[1];
Chris@82 63 Th = W[3];
Chris@82 64 Tl = T2 * Th;
Chris@82 65 Tz = FMA(T5, Th, Tg);
Chris@82 66 Ti = FNMS(T5, Th, Tg);
Chris@82 67 T7 = FMA(T5, T6, T4);
Chris@82 68 TZ = FNMS(Th, T3, Tt);
Chris@82 69 TT = FNMS(T5, TM, TS);
Chris@82 70 Tq = FNMS(Th, T6, Tp);
Chris@82 71 TW = FMA(Th, T6, Tp);
Chris@82 72 Tb = FNMS(T5, T3, Ta);
Chris@82 73 Tu = FMA(Th, T3, Tt);
Chris@82 74 TP = FMA(T5, TO, TN);
Chris@82 75 TI = FMA(T5, T3, Ta);
Chris@82 76 TF = FNMS(T5, T6, T4);
Chris@82 77 { /* second-level products built from Tz/TC and Ti/Tm with W[4], W[5] */
Chris@82 78 E T1y, T1C, T1e, T1i;
Chris@82 79 T1y = Tz * T3;
Chris@82 80 T1C = Tz * T6;
Chris@82 81 TC = FNMS(T5, Tf, Tl);
Chris@82 82 T1z = FMA(TC, T6, T1y);
Chris@82 83 T1O = FMA(TC, T3, T1C);
Chris@82 84 T1D = FNMS(TC, T3, T1C);
Chris@82 85 T1L = FNMS(TC, T6, T1y);
Chris@82 86 T1e = Ti * T3;
Chris@82 87 T1i = Ti * T6;
Chris@82 88 Tm = FMA(T5, Tf, Tl);
Chris@82 89 T1f = FMA(Tm, T6, T1e);
Chris@82 90 T1p = FMA(Tm, T3, T1i);
Chris@82 91 T1j = FNMS(Tm, T3, T1i);
Chris@82 92 T1m = FNMS(Tm, T6, T1e);
Chris@82 93 }
Chris@82 94 }
Chris@82 95 {
Chris@82 96 E Te, T1U, T3A, T3L, T1G, T2D, T2B, T3h, T1R, T2w, T2I, T3i, Tx, T3M, T1Z;
Chris@82 97 E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28;
Chris@82 98 E T2d, T38;
Chris@82 99 { /* Rp[0]/Rm[0] are used as loaded; Rp/Rm element 4 is scaled by T7/Tb first */
Chris@82 100 E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
Chris@82 101 T1 = Rp[0];
Chris@82 102 T3z = Rm[0];
Chris@82 103 T8 = Rp[WS(rs, 4)];
Chris@82 104 T9 = T7 * T8;
Chris@82 105 Tc = Rm[WS(rs, 4)];
Chris@82 106 T3x = T7 * Tc;
Chris@82 107 Td = FMA(Tb, Tc, T9);
Chris@82 108 Te = T1 + Td;
Chris@82 109 T1U = T1 - Td;
Chris@82 110 T3y = FNMS(Tb, T8, T3x);
Chris@82 111 T3A = T3y + T3z;
Chris@82 112 T3L = T3z - T3y;
Chris@82 113 }
Chris@82 114 { /* inputs Ip/Im at elements 7 and 3, scaled by TM/TO and T1z/T1D */
Chris@82 115 E T1u, T1v, T1w, T2x, T1A, T1B, T1E, T2z;
Chris@82 116 T1u = Ip[WS(rs, 7)];
Chris@82 117 T1v = TM * T1u;
Chris@82 118 T1w = Im[WS(rs, 7)];
Chris@82 119 T2x = TM * T1w;
Chris@82 120 T1A = Ip[WS(rs, 3)];
Chris@82 121 T1B = T1z * T1A;
Chris@82 122 T1E = Im[WS(rs, 3)];
Chris@82 123 T2z = T1z * T1E;
Chris@82 124 {
Chris@82 125 E T1x, T1F, T2y, T2A;
Chris@82 126 T1x = FMA(TO, T1w, T1v);
Chris@82 127 T1F = FMA(T1D, T1E, T1B);
Chris@82 128 T1G = T1x + T1F;
Chris@82 129 T2D = T1x - T1F;
Chris@82 130 T2y = FNMS(TO, T1u, T2x);
Chris@82 131 T2A = FNMS(T1D, T1A, T2z);
Chris@82 132 T2B = T2y - T2A;
Chris@82 133 T3h = T2y + T2A;
Chris@82 134 }
Chris@82 135 }
Chris@82 136 { /* inputs Ip/Im at elements 1 and 5, scaled by Tf/Th and T1L/T1O */
Chris@82 137 E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G;
Chris@82 138 T1H = Ip[WS(rs, 1)];
Chris@82 139 T1I = Tf * T1H;
Chris@82 140 T1J = Im[WS(rs, 1)];
Chris@82 141 T2E = Tf * T1J;
Chris@82 142 T1M = Ip[WS(rs, 5)];
Chris@82 143 T1N = T1L * T1M;
Chris@82 144 T1P = Im[WS(rs, 5)];
Chris@82 145 T2G = T1L * T1P;
Chris@82 146 {
Chris@82 147 E T1K, T1Q, T2F, T2H;
Chris@82 148 T1K = FMA(Th, T1J, T1I);
Chris@82 149 T1Q = FMA(T1O, T1P, T1N);
Chris@82 150 T1R = T1K + T1Q;
Chris@82 151 T2w = T1Q - T1K;
Chris@82 152 T2F = FNMS(Th, T1H, T2E);
Chris@82 153 T2H = FNMS(T1O, T1M, T2G);
Chris@82 154 T2I = T2F - T2H;
Chris@82 155 T3i = T2F + T2H;
Chris@82 156 }
Chris@82 157 }
Chris@82 158 { /* inputs Rp/Rm at elements 2 and 6, scaled by Ti/Tm and Tq/Tu */
Chris@82 159 E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
Chris@82 160 Tj = Rp[WS(rs, 2)];
Chris@82 161 Tk = Ti * Tj;
Chris@82 162 Tn = Rm[WS(rs, 2)];
Chris@82 163 T1V = Ti * Tn;
Chris@82 164 Tr = Rp[WS(rs, 6)];
Chris@82 165 Ts = Tq * Tr;
Chris@82 166 Tv = Rm[WS(rs, 6)];
Chris@82 167 T1X = Tq * Tv;
Chris@82 168 {
Chris@82 169 E To, Tw, T1W, T1Y;
Chris@82 170 To = FMA(Tm, Tn, Tk);
Chris@82 171 Tw = FMA(Tu, Tv, Ts);
Chris@82 172 Tx = To + Tw;
Chris@82 173 T3M = To - Tw;
Chris@82 174 T1W = FNMS(Tm, Tj, T1V);
Chris@82 175 T1Y = FNMS(Tu, Tr, T1X);
Chris@82 176 T1Z = T1W - T1Y;
Chris@82 177 T3w = T1W + T1Y;
Chris@82 178 }
Chris@82 179 }
Chris@82 180 { /* inputs Rp/Rm at elements 1 and 5, scaled by Tz/TC and TF/TI */
Chris@82 181 E TA, TB, TD, T21, TG, TH, TJ, T23;
Chris@82 182 TA = Rp[WS(rs, 1)];
Chris@82 183 TB = Tz * TA;
Chris@82 184 TD = Rm[WS(rs, 1)];
Chris@82 185 T21 = Tz * TD;
Chris@82 186 TG = Rp[WS(rs, 5)];
Chris@82 187 TH = TF * TG;
Chris@82 188 TJ = Rm[WS(rs, 5)];
Chris@82 189 T23 = TF * TJ;
Chris@82 190 {
Chris@82 191 E TE, TK, T22, T24;
Chris@82 192 TE = FMA(TC, TD, TB);
Chris@82 193 TK = FMA(TI, TJ, TH);
Chris@82 194 TL = TE + TK;
Chris@82 195 T26 = TE - TK;
Chris@82 196 T22 = FNMS(TC, TA, T21);
Chris@82 197 T24 = FNMS(TI, TG, T23);
Chris@82 198 T25 = T22 - T24;
Chris@82 199 T37 = T22 + T24;
Chris@82 200 }
Chris@82 201 }
Chris@82 202 { /* inputs Ip/Im at elements 0 and 4, scaled by T2/T5 and T3/T6 */
Chris@82 203 E T15, T16, T17, T2h, T19, T1a, T1b, T2j;
Chris@82 204 T15 = Ip[0];
Chris@82 205 T16 = T2 * T15;
Chris@82 206 T17 = Im[0];
Chris@82 207 T2h = T2 * T17;
Chris@82 208 T19 = Ip[WS(rs, 4)];
Chris@82 209 T1a = T3 * T19;
Chris@82 210 T1b = Im[WS(rs, 4)];
Chris@82 211 T2j = T3 * T1b;
Chris@82 212 {
Chris@82 213 E T18, T1c, T2i, T2k;
Chris@82 214 T18 = FMA(T5, T17, T16);
Chris@82 215 T1c = FMA(T6, T1b, T1a);
Chris@82 216 T1d = T18 + T1c;
Chris@82 217 T2o = T18 - T1c;
Chris@82 218 T2i = FNMS(T5, T15, T2h);
Chris@82 219 T2k = FNMS(T6, T19, T2j);
Chris@82 220 T2l = T2i - T2k;
Chris@82 221 T3c = T2i + T2k;
Chris@82 222 }
Chris@82 223 }
Chris@82 224 { /* inputs Ip/Im at elements 2 and 6, scaled by T1f/T1j and T1m/T1p */
Chris@82 225 E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r;
Chris@82 226 T1g = Ip[WS(rs, 2)];
Chris@82 227 T1h = T1f * T1g;
Chris@82 228 T1k = Im[WS(rs, 2)];
Chris@82 229 T2p = T1f * T1k;
Chris@82 230 T1n = Ip[WS(rs, 6)];
Chris@82 231 T1o = T1m * T1n;
Chris@82 232 T1q = Im[WS(rs, 6)];
Chris@82 233 T2r = T1m * T1q;
Chris@82 234 {
Chris@82 235 E T1l, T1r, T2q, T2s;
Chris@82 236 T1l = FMA(T1j, T1k, T1h);
Chris@82 237 T1r = FMA(T1p, T1q, T1o);
Chris@82 238 T1s = T1l + T1r;
Chris@82 239 T2m = T1l - T1r;
Chris@82 240 T2q = FNMS(T1j, T1g, T2p);
Chris@82 241 T2s = FNMS(T1p, T1n, T2r);
Chris@82 242 T2t = T2q - T2s;
Chris@82 243 T3d = T2q + T2s;
Chris@82 244 }
Chris@82 245 }
Chris@82 246 { /* inputs Rp/Rm at elements 7 and 3, scaled by TP/TT and TW/TZ */
Chris@82 247 E TQ, TR, TU, T29, TX, TY, T10, T2b;
Chris@82 248 TQ = Rp[WS(rs, 7)];
Chris@82 249 TR = TP * TQ;
Chris@82 250 TU = Rm[WS(rs, 7)];
Chris@82 251 T29 = TP * TU;
Chris@82 252 TX = Rp[WS(rs, 3)];
Chris@82 253 TY = TW * TX;
Chris@82 254 T10 = Rm[WS(rs, 3)];
Chris@82 255 T2b = TW * T10;
Chris@82 256 {
Chris@82 257 E TV, T11, T2a, T2c;
Chris@82 258 TV = FMA(TT, TU, TR);
Chris@82 259 T11 = FMA(TZ, T10, TY);
Chris@82 260 T12 = TV + T11;
Chris@82 261 T28 = TV - T11;
Chris@82 262 T2a = FNMS(TT, TQ, T29);
Chris@82 263 T2c = FNMS(TZ, TX, T2b);
Chris@82 264 T2d = T2a - T2c;
Chris@82 265 T38 = T2a + T2c;
Chris@82 266 }
Chris@82 267 }
Chris@82 268 { /* outputs: Rp/Ip elements 0 and 4, Rm/Im elements 7 and 3 (sums/differences only) */
Chris@82 269 E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
Chris@82 270 {
Chris@82 271 E Ty, T13, T3v, T3B;
Chris@82 272 Ty = Te + Tx;
Chris@82 273 T13 = TL + T12;
Chris@82 274 T14 = Ty + T13;
Chris@82 275 T3q = Ty - T13;
Chris@82 276 T3v = T37 + T38;
Chris@82 277 T3B = T3w + T3A;
Chris@82 278 T3C = T3v + T3B;
Chris@82 279 T3E = T3B - T3v;
Chris@82 280 }
Chris@82 281 {
Chris@82 282 E T1t, T1S, T3r, T3s;
Chris@82 283 T1t = T1d + T1s;
Chris@82 284 T1S = T1G + T1R;
Chris@82 285 T1T = T1t + T1S;
Chris@82 286 T3D = T1S - T1t;
Chris@82 287 T3r = T3c + T3d;
Chris@82 288 T3s = T3h + T3i;
Chris@82 289 T3t = T3r - T3s;
Chris@82 290 T3u = T3r + T3s;
Chris@82 291 }
Chris@82 292 Rm[WS(rs, 7)] = T14 - T1T;
Chris@82 293 Im[WS(rs, 7)] = T3u - T3C;
Chris@82 294 Rp[0] = T14 + T1T;
Chris@82 295 Ip[0] = T3u + T3C;
Chris@82 296 Rm[WS(rs, 3)] = T3q - T3t;
Chris@82 297 Im[WS(rs, 3)] = T3D - T3E;
Chris@82 298 Rp[WS(rs, 4)] = T3q + T3t;
Chris@82 299 Ip[WS(rs, 4)] = T3D + T3E;
Chris@82 300 }
Chris@82 301 { /* outputs: Rp/Ip elements 2 and 6, Rm/Im elements 5 and 1 (scaled by KP707106781) */
Chris@82 302 E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
Chris@82 303 {
Chris@82 304 E T36, T39, T3F, T3G;
Chris@82 305 T36 = Te - Tx;
Chris@82 306 T39 = T37 - T38;
Chris@82 307 T3a = T36 + T39;
Chris@82 308 T3m = T36 - T39;
Chris@82 309 T3F = T12 - TL;
Chris@82 310 T3G = T3A - T3w;
Chris@82 311 T3H = T3F + T3G;
Chris@82 312 T3J = T3G - T3F;
Chris@82 313 }
Chris@82 314 {
Chris@82 315 E T3b, T3e, T3g, T3j;
Chris@82 316 T3b = T1d - T1s;
Chris@82 317 T3e = T3c - T3d;
Chris@82 318 T3f = T3b + T3e;
Chris@82 319 T3n = T3e - T3b;
Chris@82 320 T3g = T1G - T1R;
Chris@82 321 T3j = T3h - T3i;
Chris@82 322 T3k = T3g - T3j;
Chris@82 323 T3o = T3g + T3j;
Chris@82 324 }
Chris@82 325 {
Chris@82 326 E T3l, T3I, T3p, T3K;
Chris@82 327 T3l = T3f + T3k;
Chris@82 328 Rm[WS(rs, 5)] = FNMS(KP707106781, T3l, T3a);
Chris@82 329 Rp[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
Chris@82 330 T3I = T3n + T3o;
Chris@82 331 Im[WS(rs, 5)] = FMS(KP707106781, T3I, T3H);
Chris@82 332 Ip[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
Chris@82 333 T3p = T3n - T3o;
Chris@82 334 Rm[WS(rs, 1)] = FNMS(KP707106781, T3p, T3m);
Chris@82 335 Rp[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
Chris@82 336 T3K = T3k - T3f;
Chris@82 337 Im[WS(rs, 1)] = FMS(KP707106781, T3K, T3J);
Chris@82 338 Ip[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
Chris@82 339 }
Chris@82 340 }
Chris@82 341 { /* outputs: Rp/Ip elements 1, 3, 5, 7 and Rm/Im elements 0, 2, 4, 6 (use all three constants) */
Chris@82 342 E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K;
Chris@82 343 E T2O;
Chris@82 344 {
Chris@82 345 E T27, T2e, T2n, T2u;
Chris@82 346 T20 = T1U - T1Z;
Chris@82 347 T3N = T3L - T3M;
Chris@82 348 T3T = T3M + T3L;
Chris@82 349 T2Q = T1U + T1Z;
Chris@82 350 T27 = T25 - T26;
Chris@82 351 T2e = T28 + T2d;
Chris@82 352 T2f = T27 - T2e;
Chris@82 353 T3O = T27 + T2e;
Chris@82 354 {
Chris@82 355 E T2Y, T2Z, T2R, T2S;
Chris@82 356 T2Y = T2D + T2I;
Chris@82 357 T2Z = T2B + T2w;
Chris@82 358 T30 = FNMS(KP414213562, T2Z, T2Y);
Chris@82 359 T34 = FMA(KP414213562, T2Y, T2Z);
Chris@82 360 T2R = T26 + T25;
Chris@82 361 T2S = T28 - T2d;
Chris@82 362 T2T = T2R + T2S;
Chris@82 363 T3U = T2S - T2R;
Chris@82 364 }
Chris@82 365 T2n = T2l + T2m;
Chris@82 366 T2u = T2o - T2t;
Chris@82 367 T2v = FMA(KP414213562, T2u, T2n);
Chris@82 368 T2N = FNMS(KP414213562, T2n, T2u);
Chris@82 369 {
Chris@82 370 E T2V, T2W, T2C, T2J;
Chris@82 371 T2V = T2o + T2t;
Chris@82 372 T2W = T2l - T2m;
Chris@82 373 T2X = FMA(KP414213562, T2W, T2V);
Chris@82 374 T33 = FNMS(KP414213562, T2V, T2W);
Chris@82 375 T2C = T2w - T2B;
Chris@82 376 T2J = T2D - T2I;
Chris@82 377 T2K = FMA(KP414213562, T2J, T2C);
Chris@82 378 T2O = FNMS(KP414213562, T2C, T2J);
Chris@82 379 }
Chris@82 380 }
Chris@82 381 {
Chris@82 382 E T2g, T2L, T3V, T3W;
Chris@82 383 T2g = FMA(KP707106781, T2f, T20);
Chris@82 384 T2L = T2v + T2K;
Chris@82 385 Rm[WS(rs, 4)] = FNMS(KP923879532, T2L, T2g);
Chris@82 386 Rp[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
Chris@82 387 T3V = FMA(KP707106781, T3U, T3T);
Chris@82 388 T3W = T2O - T2N;
Chris@82 389 Im[WS(rs, 4)] = FMS(KP923879532, T3W, T3V);
Chris@82 390 Ip[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
Chris@82 391 }
Chris@82 392 {
Chris@82 393 E T2M, T2P, T3X, T3Y;
Chris@82 394 T2M = FNMS(KP707106781, T2f, T20);
Chris@82 395 T2P = T2N + T2O;
Chris@82 396 Rp[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
Chris@82 397 Rm[0] = FMA(KP923879532, T2P, T2M);
Chris@82 398 T3X = FNMS(KP707106781, T3U, T3T);
Chris@82 399 T3Y = T2K - T2v;
Chris@82 400 Im[0] = FMS(KP923879532, T3Y, T3X);
Chris@82 401 Ip[WS(rs, 7)] = FMA(KP923879532, T3Y, T3X);
Chris@82 402 }
Chris@82 403 {
Chris@82 404 E T2U, T31, T3P, T3Q;
Chris@82 405 T2U = FMA(KP707106781, T2T, T2Q);
Chris@82 406 T31 = T2X + T30;
Chris@82 407 Rm[WS(rs, 6)] = FNMS(KP923879532, T31, T2U);
Chris@82 408 Rp[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
Chris@82 409 T3P = FMA(KP707106781, T3O, T3N);
Chris@82 410 T3Q = T33 + T34;
Chris@82 411 Im[WS(rs, 6)] = FMS(KP923879532, T3Q, T3P);
Chris@82 412 Ip[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
Chris@82 413 }
Chris@82 414 {
Chris@82 415 E T32, T35, T3R, T3S;
Chris@82 416 T32 = FNMS(KP707106781, T2T, T2Q);
Chris@82 417 T35 = T33 - T34;
Chris@82 418 Rm[WS(rs, 2)] = FNMS(KP923879532, T35, T32);
Chris@82 419 Rp[WS(rs, 5)] = FMA(KP923879532, T35, T32);
Chris@82 420 T3R = FNMS(KP707106781, T3O, T3N);
Chris@82 421 T3S = T30 - T2X;
Chris@82 422 Im[WS(rs, 2)] = FMS(KP923879532, T3S, T3R);
Chris@82 423 Ip[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
Chris@82 424 }
Chris@82 425 }
Chris@82 426 }
Chris@82 427 }
Chris@82 428 }
Chris@82 429 }
Chris@82 430
/*
 * Twiddle instruction table for the FMA build of hc2cf2_16; it is passed to
 * the planner through the desc initializer. It holds four TW_CEXP entries
 * whose last field is 1, 3, 9 and 15, followed by one TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry yields a (cos, sin) pair, which
 * would account for the 8 values per step that hc2cf2_16 reads as W[0..7] --
 * confirm against the tw_instr definition in the project headers.
 */
Chris@82 431 static const tw_instr twinstr[] = {
Chris@82 432 {TW_CEXP, 1, 1},
Chris@82 433 {TW_CEXP, 1, 3},
Chris@82 434 {TW_CEXP, 1, 9},
Chris@82 435 {TW_CEXP, 1, 15},
Chris@82 436 {TW_NEXT, 1, 0}
Chris@82 437 };
Chris@82 438
/*
 * Descriptor for the FMA build: the value 16 (matching the generator's
 * "-n 16"), the codelet name string, the twiddle instruction table, &GENUS,
 * and the counts {104, 42, 92, 0}. The first three counts equal the
 * "104 additions, 42 multiplications, 92 fused multiply/add" figures quoted
 * in the generated comment for this variant; the meaning of the trailing 0
 * is not visible in this file.
 */
Chris@82 439 static const hc2c_desc desc = { 16, "hc2cf2_16", twinstr, &GENUS, {104, 42, 92, 0} };
Chris@82 440
/*
 * Registration entry point (FMA build): hands the hc2cf2_16 kernel, its
 * descriptor and the HC2C_VIA_RDFT constant to X(khc2c_register) for
 * planner p. X() is a macro from the project headers; its expansion is not
 * visible in this file.
 */
Chris@82 441 void X(codelet_hc2cf2_16) (planner *p) {
Chris@82 442 X(khc2c_register) (p, hc2cf2_16, &desc, HC2C_VIA_RDFT);
Chris@82 443 }
Chris@82 444 #else
Chris@82 445
Chris@82 446 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cf2_16 -include rdft/scalar/hc2cf.h */
Chris@82 447
Chris@82 448 /*
Chris@82 449 * This function contains 196 FP additions, 108 FP multiplications,
Chris@82 450 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
Chris@82 451 * 82 stack variables, 3 constants, and 64 memory accesses
Chris@82 452 */
Chris@82 453 #include "rdft/scalar/hc2cf.h"
Chris@82 454
/*
 * hc2cf2_16 (non-FMA build): machine-generated size-16 kernel (generator
 * options "-n 16 -dit -twiddle-log3 -precompute-twiddles", without "-fma"),
 * compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is
 * defined.
 *
 * Rp, Ip, Rm, Im: four data arrays that are read and then overwritten in
 *     place. Every iteration touches elements WS(rs, 0) .. WS(rs, 7) of each
 *     array (element 0 is addressed directly as [0]).
 * W:  twiddle table. Eight values, W[0] .. W[7], are read per iteration,
 *     starting at offset (mb - 1) * 8 from the pointer passed in.
 * rs: stride handed to WS() to address the eight elements of each array.
 * mb, me: the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms: per-iteration pointer step; Rp and Ip advance by ms while Rm and Im
 *     move back by ms.
 *
 * E, R, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied
 * by the included project headers and are not defined in this file.
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are presumably a*b + c and
 * c - a*b respectively -- confirm in the FFTW headers.
 */
Chris@82 455 static void hc2cf2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 456 {
Chris@82 457 DK(KP382683432, +0.382683432365089771728459984030398866761344562); /* numerically sin(pi/8) */
Chris@82 458 DK(KP923879532, +0.923879532511286756128183189396788286822416626); /* numerically cos(pi/8) */
Chris@82 459 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically sqrt(1/2) */
Chris@82 460 {
Chris@82 461 INT m;
Chris@82 462 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@82 463 E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
Chris@82 464 E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
Chris@82 465 { /* load W[0..7] and derive every other multiplier used below from their products */
Chris@82 466 E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
Chris@82 467 {
Chris@82 468 E Th, Tn, Tj, Tm;
Chris@82 469 T2 = W[0];
Chris@82 470 T5 = W[1];
Chris@82 471 Tg = W[2];
Chris@82 472 Ti = W[3];
Chris@82 473 Th = T2 * Tg;
Chris@82 474 Tn = T5 * Tg;
Chris@82 475 Tj = T5 * Ti;
Chris@82 476 Tm = T2 * Ti;
Chris@82 477 Tk = Th - Tj;
Chris@82 478 To = Tm + Tn;
Chris@82 479 TE = Tm - Tn;
Chris@82 480 TC = Th + Tj;
Chris@82 481 T6 = W[5];
Chris@82 482 T7 = T5 * T6;
Chris@82 483 Tv = Tg * T6;
Chris@82 484 Ta = T2 * T6;
Chris@82 485 Ts = Ti * T6;
Chris@82 486 T3 = W[4];
Chris@82 487 T4 = T2 * T3;
Chris@82 488 Tw = Ti * T3;
Chris@82 489 Tb = T5 * T3;
Chris@82 490 Tr = Tg * T3;
Chris@82 491 }
Chris@82 492 T8 = T4 + T7;
Chris@82 493 TW = Tv - Tw;
Chris@82 494 TJ = Ta + Tb;
Chris@82 495 Tt = Tr - Ts;
Chris@82 496 TU = Tr + Ts;
Chris@82 497 Tc = Ta - Tb;
Chris@82 498 Tx = Tv + Tw;
Chris@82 499 TH = T4 - T7;
Chris@82 500 TN = W[6];
Chris@82 501 TO = W[7];
Chris@82 502 TP = FMA(T2, TN, T5 * TO);
Chris@82 503 TR = FNMS(T5, TN, T2 * TO);
Chris@82 504 { /* second-level products built from Tk/To with W[4], W[5] */
Chris@82 505 E T1d, T1e, T19, T1a;
Chris@82 506 T1d = Tk * T6;
Chris@82 507 T1e = To * T3;
Chris@82 508 T1f = T1d - T1e;
Chris@82 509 T1k = T1d + T1e;
Chris@82 510 T19 = Tk * T3;
Chris@82 511 T1a = To * T6;
Chris@82 512 T1b = T19 + T1a;
Chris@82 513 T1i = T19 - T1a;
Chris@82 514 }
Chris@82 515 { /* second-level products built from TC/TE with W[4], W[5] */
Chris@82 516 E T1w, T1x, T1s, T1t;
Chris@82 517 T1w = TC * T6;
Chris@82 518 T1x = TE * T3;
Chris@82 519 T1y = T1w - T1x;
Chris@82 520 T1H = T1w + T1x;
Chris@82 521 T1s = TC * T3;
Chris@82 522 T1t = TE * T6;
Chris@82 523 T1u = T1s + T1t;
Chris@82 524 T1F = T1s - T1t;
Chris@82 525 }
Chris@82 526 }
Chris@82 527 {
Chris@82 528 E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
Chris@82 529 E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
Chris@82 530 E T2S, T2T, T28, T2A, T2d, T2B;
Chris@82 531 { /* Rp[0]/Rm[0] are used as loaded; Rp/Rm element 4 is scaled by T8/Tc first */
Chris@82 532 E T1, T3d, Te, T3c, T9, Td;
Chris@82 533 T1 = Rp[0];
Chris@82 534 T3d = Rm[0];
Chris@82 535 T9 = Rp[WS(rs, 4)];
Chris@82 536 Td = Rm[WS(rs, 4)];
Chris@82 537 Te = FMA(T8, T9, Tc * Td);
Chris@82 538 T3c = FNMS(Tc, T9, T8 * Td);
Chris@82 539 Tf = T1 + Te;
Chris@82 540 T3r = T3d - T3c;
Chris@82 541 T1N = T1 - Te;
Chris@82 542 T3e = T3c + T3d;
Chris@82 543 }
Chris@82 544 { /* inputs Rp/Rm at elements 2 and 6, scaled by Tk/To and Tt/Tx */
Chris@82 545 E Tq, T1O, Tz, T1P;
Chris@82 546 {
Chris@82 547 E Tl, Tp, Tu, Ty;
Chris@82 548 Tl = Rp[WS(rs, 2)];
Chris@82 549 Tp = Rm[WS(rs, 2)];
Chris@82 550 Tq = FMA(Tk, Tl, To * Tp);
Chris@82 551 T1O = FNMS(To, Tl, Tk * Tp);
Chris@82 552 Tu = Rp[WS(rs, 6)];
Chris@82 553 Ty = Rm[WS(rs, 6)];
Chris@82 554 Tz = FMA(Tt, Tu, Tx * Ty);
Chris@82 555 T1P = FNMS(Tx, Tu, Tt * Ty);
Chris@82 556 }
Chris@82 557 TA = Tq + Tz;
Chris@82 558 T3s = Tq - Tz;
Chris@82 559 T1Q = T1O - T1P;
Chris@82 560 T3b = T1O + T1P;
Chris@82 561 }
Chris@82 562 { /* inputs Rp/Rm at elements 1 and 5, scaled by TC/TE and TH/TJ */
Chris@82 563 E TG, T1S, TL, T1T, T1U, T1V;
Chris@82 564 {
Chris@82 565 E TD, TF, TI, TK;
Chris@82 566 TD = Rp[WS(rs, 1)];
Chris@82 567 TF = Rm[WS(rs, 1)];
Chris@82 568 TG = FMA(TC, TD, TE * TF);
Chris@82 569 T1S = FNMS(TE, TD, TC * TF);
Chris@82 570 TI = Rp[WS(rs, 5)];
Chris@82 571 TK = Rm[WS(rs, 5)];
Chris@82 572 TL = FMA(TH, TI, TJ * TK);
Chris@82 573 T1T = FNMS(TJ, TI, TH * TK);
Chris@82 574 }
Chris@82 575 TM = TG + TL;
Chris@82 576 T2M = T1S + T1T;
Chris@82 577 T1U = T1S - T1T;
Chris@82 578 T1V = TG - TL;
Chris@82 579 T1W = T1U - T1V;
Chris@82 580 T2w = T1V + T1U;
Chris@82 581 }
Chris@82 582 { /* inputs Rp/Rm at elements 7 and 3, scaled by TP/TR and TU/TW */
Chris@82 583 E TT, T1Y, TY, T1Z, T1X, T20;
Chris@82 584 {
Chris@82 585 E TQ, TS, TV, TX;
Chris@82 586 TQ = Rp[WS(rs, 7)];
Chris@82 587 TS = Rm[WS(rs, 7)];
Chris@82 588 TT = FMA(TP, TQ, TR * TS);
Chris@82 589 T1Y = FNMS(TR, TQ, TP * TS);
Chris@82 590 TV = Rp[WS(rs, 3)];
Chris@82 591 TX = Rm[WS(rs, 3)];
Chris@82 592 TY = FMA(TU, TV, TW * TX);
Chris@82 593 T1Z = FNMS(TW, TV, TU * TX);
Chris@82 594 }
Chris@82 595 TZ = TT + TY;
Chris@82 596 T2N = T1Y + T1Z;
Chris@82 597 T1X = TT - TY;
Chris@82 598 T20 = T1Y - T1Z;
Chris@82 599 T21 = T1X + T20;
Chris@82 600 T2x = T1X - T20;
Chris@82 601 }
Chris@82 602 { /* inputs Ip/Im at elements 7, 5, 3 and 1 */
Chris@82 603 E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
Chris@82 604 {
Chris@82 605 E T1p, T1q, T1G, T1I;
Chris@82 606 T1p = Ip[WS(rs, 7)];
Chris@82 607 T1q = Im[WS(rs, 7)];
Chris@82 608 T1r = FMA(TN, T1p, TO * T1q);
Chris@82 609 T2k = FNMS(TO, T1p, TN * T1q);
Chris@82 610 T1G = Ip[WS(rs, 5)];
Chris@82 611 T1I = Im[WS(rs, 5)];
Chris@82 612 T1J = FMA(T1F, T1G, T1H * T1I);
Chris@82 613 T2h = FNMS(T1H, T1G, T1F * T1I);
Chris@82 614 }
Chris@82 615 {
Chris@82 616 E T1v, T1z, T1C, T1D;
Chris@82 617 T1v = Ip[WS(rs, 3)];
Chris@82 618 T1z = Im[WS(rs, 3)];
Chris@82 619 T1A = FMA(T1u, T1v, T1y * T1z);
Chris@82 620 T2l = FNMS(T1y, T1v, T1u * T1z);
Chris@82 621 T1C = Ip[WS(rs, 1)];
Chris@82 622 T1D = Im[WS(rs, 1)];
Chris@82 623 T1E = FMA(Tg, T1C, Ti * T1D);
Chris@82 624 T2g = FNMS(Ti, T1C, Tg * T1D);
Chris@82 625 }
Chris@82 626 T1B = T1r + T1A;
Chris@82 627 T1K = T1E + T1J;
Chris@82 628 T2V = T1B - T1K;
Chris@82 629 T2W = T2k + T2l;
Chris@82 630 T2X = T2g + T2h;
Chris@82 631 T2Y = T2W - T2X;
Chris@82 632 {
Chris@82 633 E T2f, T2i, T2m, T2n;
Chris@82 634 T2f = T1r - T1A;
Chris@82 635 T2i = T2g - T2h;
Chris@82 636 T2j = T2f - T2i;
Chris@82 637 T2D = T2f + T2i;
Chris@82 638 T2m = T2k - T2l;
Chris@82 639 T2n = T1E - T1J;
Chris@82 640 T2o = T2m + T2n;
Chris@82 641 T2E = T2m - T2n;
Chris@82 642 }
Chris@82 643 }
Chris@82 644 { /* inputs Ip/Im at elements 0, 6, 4 and 2 */
Chris@82 645 E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
Chris@82 646 {
Chris@82 647 E T12, T13, T1j, T1l;
Chris@82 648 T12 = Ip[0];
Chris@82 649 T13 = Im[0];
Chris@82 650 T14 = FMA(T2, T12, T5 * T13);
Chris@82 651 T24 = FNMS(T5, T12, T2 * T13);
Chris@82 652 T1j = Ip[WS(rs, 6)];
Chris@82 653 T1l = Im[WS(rs, 6)];
Chris@82 654 T1m = FMA(T1i, T1j, T1k * T1l);
Chris@82 655 T2b = FNMS(T1k, T1j, T1i * T1l);
Chris@82 656 }
Chris@82 657 {
Chris@82 658 E T15, T16, T1c, T1g;
Chris@82 659 T15 = Ip[WS(rs, 4)];
Chris@82 660 T16 = Im[WS(rs, 4)];
Chris@82 661 T17 = FMA(T3, T15, T6 * T16);
Chris@82 662 T25 = FNMS(T6, T15, T3 * T16);
Chris@82 663 T1c = Ip[WS(rs, 2)];
Chris@82 664 T1g = Im[WS(rs, 2)];
Chris@82 665 T1h = FMA(T1b, T1c, T1f * T1g);
Chris@82 666 T2a = FNMS(T1f, T1c, T1b * T1g);
Chris@82 667 }
Chris@82 668 T18 = T14 + T17;
Chris@82 669 T1n = T1h + T1m;
Chris@82 670 T2Q = T18 - T1n;
Chris@82 671 T2R = T24 + T25;
Chris@82 672 T2S = T2a + T2b;
Chris@82 673 T2T = T2R - T2S;
Chris@82 674 {
Chris@82 675 E T26, T27, T29, T2c;
Chris@82 676 T26 = T24 - T25;
Chris@82 677 T27 = T1h - T1m;
Chris@82 678 T28 = T26 + T27;
Chris@82 679 T2A = T26 - T27;
Chris@82 680 T29 = T14 - T17;
Chris@82 681 T2c = T2a - T2b;
Chris@82 682 T2d = T29 - T2c;
Chris@82 683 T2B = T29 + T2c;
Chris@82 684 }
Chris@82 685 }
Chris@82 686 { /* outputs: Rp/Ip elements 3 and 7, Rm/Im elements 4 and 0 */
Chris@82 687 E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
Chris@82 688 {
Chris@82 689 E T1R, T22, T3y, T3z;
Chris@82 690 T1R = T1N - T1Q;
Chris@82 691 T22 = KP707106781 * (T1W - T21);
Chris@82 692 T23 = T1R + T22;
Chris@82 693 T2r = T1R - T22;
Chris@82 694 T3y = KP707106781 * (T2x - T2w);
Chris@82 695 T3z = T3s + T3r;
Chris@82 696 T3A = T3y + T3z;
Chris@82 697 T3C = T3z - T3y;
Chris@82 698 }
Chris@82 699 {
Chris@82 700 E T2e, T2p, T2s, T2t;
Chris@82 701 T2e = FMA(KP923879532, T28, KP382683432 * T2d);
Chris@82 702 T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
Chris@82 703 T2q = T2e + T2p;
Chris@82 704 T3B = T2p - T2e;
Chris@82 705 T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
Chris@82 706 T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
Chris@82 707 T2u = T2s - T2t;
Chris@82 708 T3x = T2s + T2t;
Chris@82 709 }
Chris@82 710 Rm[WS(rs, 4)] = T23 - T2q;
Chris@82 711 Im[WS(rs, 4)] = T3x - T3A;
Chris@82 712 Rp[WS(rs, 3)] = T23 + T2q;
Chris@82 713 Ip[WS(rs, 3)] = T3x + T3A;
Chris@82 714 Rm[0] = T2r - T2u;
Chris@82 715 Im[0] = T3B - T3C;
Chris@82 716 Rp[WS(rs, 7)] = T2r + T2u;
Chris@82 717 Ip[WS(rs, 7)] = T3B + T3C;
Chris@82 718 }
Chris@82 719 { /* outputs: Rp/Ip elements 2 and 6, Rm/Im elements 5 and 1 */
Chris@82 720 E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
Chris@82 721 {
Chris@82 722 E T2L, T2O, T3k, T3l;
Chris@82 723 T2L = Tf - TA;
Chris@82 724 T2O = T2M - T2N;
Chris@82 725 T2P = T2L + T2O;
Chris@82 726 T31 = T2L - T2O;
Chris@82 727 T3k = TZ - TM;
Chris@82 728 T3l = T3e - T3b;
Chris@82 729 T3m = T3k + T3l;
Chris@82 730 T3o = T3l - T3k;
Chris@82 731 }
Chris@82 732 {
Chris@82 733 E T2U, T2Z, T32, T33;
Chris@82 734 T2U = T2Q + T2T;
Chris@82 735 T2Z = T2V - T2Y;
Chris@82 736 T30 = KP707106781 * (T2U + T2Z);
Chris@82 737 T3n = KP707106781 * (T2Z - T2U);
Chris@82 738 T32 = T2T - T2Q;
Chris@82 739 T33 = T2V + T2Y;
Chris@82 740 T34 = KP707106781 * (T32 - T33);
Chris@82 741 T3j = KP707106781 * (T32 + T33);
Chris@82 742 }
Chris@82 743 Rm[WS(rs, 5)] = T2P - T30;
Chris@82 744 Im[WS(rs, 5)] = T3j - T3m;
Chris@82 745 Rp[WS(rs, 2)] = T2P + T30;
Chris@82 746 Ip[WS(rs, 2)] = T3j + T3m;
Chris@82 747 Rm[WS(rs, 1)] = T31 - T34;
Chris@82 748 Im[WS(rs, 1)] = T3n - T3o;
Chris@82 749 Rp[WS(rs, 6)] = T31 + T34;
Chris@82 750 Ip[WS(rs, 6)] = T3n + T3o;
Chris@82 751 }
Chris@82 752 { /* outputs: Rp/Ip elements 1 and 5, Rm/Im elements 6 and 2 */
Chris@82 753 E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
Chris@82 754 {
Chris@82 755 E T2v, T2y, T3q, T3t;
Chris@82 756 T2v = T1N + T1Q;
Chris@82 757 T2y = KP707106781 * (T2w + T2x);
Chris@82 758 T2z = T2v + T2y;
Chris@82 759 T2H = T2v - T2y;
Chris@82 760 T3q = KP707106781 * (T1W + T21);
Chris@82 761 T3t = T3r - T3s;
Chris@82 762 T3u = T3q + T3t;
Chris@82 763 T3w = T3t - T3q;
Chris@82 764 }
Chris@82 765 {
Chris@82 766 E T2C, T2F, T2I, T2J;
Chris@82 767 T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
Chris@82 768 T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
Chris@82 769 T2G = T2C + T2F;
Chris@82 770 T3v = T2F - T2C;
Chris@82 771 T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
Chris@82 772 T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
Chris@82 773 T2K = T2I - T2J;
Chris@82 774 T3p = T2I + T2J;
Chris@82 775 }
Chris@82 776 Rm[WS(rs, 6)] = T2z - T2G;
Chris@82 777 Im[WS(rs, 6)] = T3p - T3u;
Chris@82 778 Rp[WS(rs, 1)] = T2z + T2G;
Chris@82 779 Ip[WS(rs, 1)] = T3p + T3u;
Chris@82 780 Rm[WS(rs, 2)] = T2H - T2K;
Chris@82 781 Im[WS(rs, 2)] = T3v - T3w;
Chris@82 782 Rp[WS(rs, 5)] = T2H + T2K;
Chris@82 783 Ip[WS(rs, 5)] = T3v + T3w;
Chris@82 784 }
Chris@82 785 { /* outputs: Rp/Ip elements 0 and 4, Rm/Im elements 7 and 3 (sums/differences only) */
Chris@82 786 E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
Chris@82 787 {
Chris@82 788 E TB, T10, T3a, T3f;
Chris@82 789 TB = Tf + TA;
Chris@82 790 T10 = TM + TZ;
Chris@82 791 T11 = TB + T10;
Chris@82 792 T35 = TB - T10;
Chris@82 793 T3a = T2M + T2N;
Chris@82 794 T3f = T3b + T3e;
Chris@82 795 T3g = T3a + T3f;
Chris@82 796 T3i = T3f - T3a;
Chris@82 797 }
Chris@82 798 {
Chris@82 799 E T1o, T1L, T36, T37;
Chris@82 800 T1o = T18 + T1n;
Chris@82 801 T1L = T1B + T1K;
Chris@82 802 T1M = T1o + T1L;
Chris@82 803 T3h = T1L - T1o;
Chris@82 804 T36 = T2R + T2S;
Chris@82 805 T37 = T2W + T2X;
Chris@82 806 T38 = T36 - T37;
Chris@82 807 T39 = T36 + T37;
Chris@82 808 }
Chris@82 809 Rm[WS(rs, 7)] = T11 - T1M;
Chris@82 810 Im[WS(rs, 7)] = T39 - T3g;
Chris@82 811 Rp[0] = T11 + T1M;
Chris@82 812 Ip[0] = T39 + T3g;
Chris@82 813 Rm[WS(rs, 3)] = T35 - T38;
Chris@82 814 Im[WS(rs, 3)] = T3h - T3i;
Chris@82 815 Rp[WS(rs, 4)] = T35 + T38;
Chris@82 816 Ip[WS(rs, 4)] = T3h + T3i;
Chris@82 817 }
Chris@82 818 }
Chris@82 819 }
Chris@82 820 }
Chris@82 821 }
Chris@82 822
/*
 * Twiddle instruction table for the non-FMA build of hc2cf2_16; it is passed
 * to the planner through the desc initializer. It holds four TW_CEXP entries
 * whose last field is 1, 3, 9 and 15, followed by one TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry yields a (cos, sin) pair, which
 * would account for the 8 values per step that hc2cf2_16 reads as W[0..7] --
 * confirm against the tw_instr definition in the project headers.
 */
Chris@82 823 static const tw_instr twinstr[] = {
Chris@82 824 {TW_CEXP, 1, 1},
Chris@82 825 {TW_CEXP, 1, 3},
Chris@82 826 {TW_CEXP, 1, 9},
Chris@82 827 {TW_CEXP, 1, 15},
Chris@82 828 {TW_NEXT, 1, 0}
Chris@82 829 };
Chris@82 830
/*
 * Descriptor for the non-FMA build: the value 16 (matching the generator's
 * "-n 16"), the codelet name string, the twiddle instruction table, &GENUS,
 * and the counts {156, 68, 40, 0}. The first three counts equal the
 * "156 additions, 68 multiplications, 40 fused multiply/add" figures quoted
 * in the generated comment for this variant; the meaning of the trailing 0
 * is not visible in this file.
 */
Chris@82 831 static const hc2c_desc desc = { 16, "hc2cf2_16", twinstr, &GENUS, {156, 68, 40, 0} };
Chris@82 832
/*
 * Registration entry point (non-FMA build): hands the hc2cf2_16 kernel, its
 * descriptor and the HC2C_VIA_RDFT constant to X(khc2c_register) for
 * planner p. X() is a macro from the project headers; its expansion is not
 * visible in this file.
 */
Chris@82 833 void X(codelet_hc2cf2_16) (planner *p) {
Chris@82 834 X(khc2c_register) (p, hc2cf2_16, &desc, HC2C_VIA_RDFT);
Chris@82 835 }
Chris@82 836 #endif