annotate src/fftw-3.3.8/rdft/scalar/r2cb/hc2cbdft_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:58 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include rdft/scalar/hc2cb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 206 FP additions, 100 FP multiplications,
Chris@82 32 * (or, 136 additions, 30 multiplications, 70 fused multiply/add),
Chris@82 33 * 66 stack variables, 3 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cb.h"
Chris@82 36
/*
 * hc2cbdft_16, FMA variant: compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined. Machine-generated by genfft
 * (see the "Generated by" line above for the exact options); do not
 * hand-edit the arithmetic.
 *
 * For every m in [mb, me) the body loads 32 values -- index 0 and
 * WS(rs, 1) .. WS(rs, 7) of each of Rp, Ip, Rm and Im -- combines them
 * with the 30 twiddle values W[0] .. W[29], and stores 32 results back
 * to the same locations. All loads are done before the first store.
 *
 * Rp, Ip, Rm, Im: data arrays, updated in place.
 * W:              twiddle values; offset by (mb - 1) * 30 on loop entry.
 * rs:             stride handed to WS() to address the eight elements.
 * mb, me:         first and one-past-last value of the loop counter m.
 * ms:             per-iteration step; Rp/Ip move forward by ms while
 *                 Rm/Im move backward by ms.
 */
Chris@82 37 static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numerically: KP923879532 = cos(pi/8), KP414213562 = tan(pi/8) = sqrt(2) - 1, KP707106781 = sqrt(2)/2. */
Chris@82 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 42 {
Chris@82 43 INT m;
/* One pass per m: W advances by 30 each time, matching the 30 twiddle values W[0..29] read below. */
Chris@82 44 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@82 45 E Tf, T20, T32, T3Q, T3f, T3V, TN, T2a, T1m, T2f, T2G, T3G, T2T, T3L, T1F;
Chris@82 46 E T26, T2J, T2M, T2N, T2U, T2V, T3H, Tu, T25, T3i, T3R, T1a, T2g, T1y, T21;
Chris@82 47 E T39, T3W, T1p, T2b;
/* Stage 1: load Rp/Ip at indices 0, 2, 4, 6 and Rm/Im at indices 1, 3, 5, 7, then form sums and differences. */
Chris@82 48 {
Chris@82 49 E T3, T1e, TA, T1C, T6, Tx, T1h, T1D, Td, T1A, TL, T1k, Ta, T1z, TG;
Chris@82 50 E T1j;
Chris@82 51 {
Chris@82 52 E T1, T2, T1f, T1g;
Chris@82 53 T1 = Rp[0];
Chris@82 54 T2 = Rm[WS(rs, 7)];
Chris@82 55 T3 = T1 + T2;
Chris@82 56 T1e = T1 - T2;
Chris@82 57 {
Chris@82 58 E Ty, Tz, T4, T5;
Chris@82 59 Ty = Ip[0];
Chris@82 60 Tz = Im[WS(rs, 7)];
Chris@82 61 TA = Ty + Tz;
Chris@82 62 T1C = Ty - Tz;
Chris@82 63 T4 = Rp[WS(rs, 4)];
Chris@82 64 T5 = Rm[WS(rs, 3)];
Chris@82 65 T6 = T4 + T5;
Chris@82 66 Tx = T4 - T5;
Chris@82 67 }
Chris@82 68 T1f = Ip[WS(rs, 4)];
Chris@82 69 T1g = Im[WS(rs, 3)];
Chris@82 70 T1h = T1f + T1g;
Chris@82 71 T1D = T1f - T1g;
Chris@82 72 {
Chris@82 73 E Tb, Tc, TH, TI, TJ, TK;
Chris@82 74 Tb = Rm[WS(rs, 1)];
Chris@82 75 Tc = Rp[WS(rs, 6)];
Chris@82 76 TH = Tb - Tc;
Chris@82 77 TI = Im[WS(rs, 1)];
Chris@82 78 TJ = Ip[WS(rs, 6)];
Chris@82 79 TK = TI + TJ;
Chris@82 80 Td = Tb + Tc;
Chris@82 81 T1A = TJ - TI;
Chris@82 82 TL = TH + TK;
Chris@82 83 T1k = TH - TK;
Chris@82 84 }
Chris@82 85 {
Chris@82 86 E T8, T9, TC, TD, TE, TF;
Chris@82 87 T8 = Rp[WS(rs, 2)];
Chris@82 88 T9 = Rm[WS(rs, 5)];
Chris@82 89 TC = T8 - T9;
Chris@82 90 TD = Ip[WS(rs, 2)];
Chris@82 91 TE = Im[WS(rs, 5)];
Chris@82 92 TF = TD + TE;
Chris@82 93 Ta = T8 + T9;
Chris@82 94 T1z = TD - TE;
Chris@82 95 TG = TC + TF;
Chris@82 96 T1j = TC - TF;
Chris@82 97 }
Chris@82 98 }
Chris@82 99 {
Chris@82 100 E T7, Te, T30, T31;
Chris@82 101 T7 = T3 + T6;
Chris@82 102 Te = Ta + Td;
Chris@82 103 Tf = T7 + Te;
Chris@82 104 T20 = T7 - Te;
Chris@82 105 T30 = TA - Tx;
Chris@82 106 T31 = T1j - T1k;
Chris@82 107 T32 = FMA(KP707106781, T31, T30);
Chris@82 108 T3Q = FNMS(KP707106781, T31, T30);
Chris@82 109 }
Chris@82 110 {
Chris@82 111 E T3d, T3e, TB, TM;
Chris@82 112 T3d = T1e + T1h;
Chris@82 113 T3e = TG + TL;
Chris@82 114 T3f = FNMS(KP707106781, T3e, T3d);
Chris@82 115 T3V = FMA(KP707106781, T3e, T3d);
Chris@82 116 TB = Tx + TA;
Chris@82 117 TM = TG - TL;
Chris@82 118 TN = FMA(KP707106781, TM, TB);
Chris@82 119 T2a = FNMS(KP707106781, TM, TB);
Chris@82 120 }
Chris@82 121 {
Chris@82 122 E T1i, T1l, T2E, T2F;
Chris@82 123 T1i = T1e - T1h;
Chris@82 124 T1l = T1j + T1k;
Chris@82 125 T1m = FMA(KP707106781, T1l, T1i);
Chris@82 126 T2f = FNMS(KP707106781, T1l, T1i);
Chris@82 127 T2E = T3 - T6;
Chris@82 128 T2F = T1A - T1z;
Chris@82 129 T2G = T2E + T2F;
Chris@82 130 T3G = T2E - T2F;
Chris@82 131 }
Chris@82 132 {
Chris@82 133 E T2R, T2S, T1B, T1E;
Chris@82 134 T2R = Ta - Td;
Chris@82 135 T2S = T1C - T1D;
Chris@82 136 T2T = T2R + T2S;
Chris@82 137 T3L = T2S - T2R;
Chris@82 138 T1B = T1z + T1A;
Chris@82 139 T1E = T1C + T1D;
Chris@82 140 T1F = T1B + T1E;
Chris@82 141 T26 = T1E - T1B;
Chris@82 142 }
Chris@82 143 }
/* Stage 2: load Rp/Ip at indices 1, 3, 5, 7 and Rm/Im at indices 0, 2, 4, 6, then form the remaining intermediates. */
Chris@82 144 {
Chris@82 145 E Ti, T1s, Tl, T1t, TS, TX, T34, T33, T2I, T2H, Tp, T1v, Ts, T1w, T13;
Chris@82 146 E T18, T37, T36, T2L, T2K;
Chris@82 147 {
Chris@82 148 E TT, TR, TO, TW;
Chris@82 149 {
Chris@82 150 E Tg, Th, TP, TQ;
Chris@82 151 Tg = Rp[WS(rs, 1)];
Chris@82 152 Th = Rm[WS(rs, 6)];
Chris@82 153 Ti = Tg + Th;
Chris@82 154 TT = Tg - Th;
Chris@82 155 TP = Ip[WS(rs, 1)];
Chris@82 156 TQ = Im[WS(rs, 6)];
Chris@82 157 TR = TP + TQ;
Chris@82 158 T1s = TP - TQ;
Chris@82 159 }
Chris@82 160 {
Chris@82 161 E Tj, Tk, TU, TV;
Chris@82 162 Tj = Rp[WS(rs, 5)];
Chris@82 163 Tk = Rm[WS(rs, 2)];
Chris@82 164 Tl = Tj + Tk;
Chris@82 165 TO = Tj - Tk;
Chris@82 166 TU = Ip[WS(rs, 5)];
Chris@82 167 TV = Im[WS(rs, 2)];
Chris@82 168 TW = TU + TV;
Chris@82 169 T1t = TU - TV;
Chris@82 170 }
Chris@82 171 TS = TO + TR;
Chris@82 172 TX = TT - TW;
Chris@82 173 T34 = TR - TO;
Chris@82 174 T33 = TT + TW;
Chris@82 175 T2I = T1s - T1t;
Chris@82 176 T2H = Ti - Tl;
Chris@82 177 }
Chris@82 178 {
Chris@82 179 E T14, T12, TZ, T17;
Chris@82 180 {
Chris@82 181 E Tn, To, T10, T11;
Chris@82 182 Tn = Rm[0];
Chris@82 183 To = Rp[WS(rs, 7)];
Chris@82 184 Tp = Tn + To;
Chris@82 185 T14 = Tn - To;
Chris@82 186 T10 = Im[0];
Chris@82 187 T11 = Ip[WS(rs, 7)];
Chris@82 188 T12 = T10 + T11;
Chris@82 189 T1v = T11 - T10;
Chris@82 190 }
Chris@82 191 {
Chris@82 192 E Tq, Tr, T15, T16;
Chris@82 193 Tq = Rp[WS(rs, 3)];
Chris@82 194 Tr = Rm[WS(rs, 4)];
Chris@82 195 Ts = Tq + Tr;
Chris@82 196 TZ = Tq - Tr;
Chris@82 197 T15 = Ip[WS(rs, 3)];
Chris@82 198 T16 = Im[WS(rs, 4)];
Chris@82 199 T17 = T15 + T16;
Chris@82 200 T1w = T15 - T16;
Chris@82 201 }
Chris@82 202 T13 = TZ - T12;
Chris@82 203 T18 = T14 - T17;
Chris@82 204 T37 = TZ + T12;
Chris@82 205 T36 = T14 + T17;
Chris@82 206 T2L = T1v - T1w;
Chris@82 207 T2K = Tp - Ts;
Chris@82 208 }
Chris@82 209 T2J = T2H - T2I;
Chris@82 210 T2M = T2K + T2L;
Chris@82 211 T2N = T2J + T2M;
Chris@82 212 T2U = T2H + T2I;
Chris@82 213 T2V = T2L - T2K;
Chris@82 214 T3H = T2V - T2U;
Chris@82 215 {
Chris@82 216 E Tm, Tt, T3g, T3h;
Chris@82 217 Tm = Ti + Tl;
Chris@82 218 Tt = Tp + Ts;
Chris@82 219 Tu = Tm + Tt;
Chris@82 220 T25 = Tm - Tt;
Chris@82 221 T3g = FNMS(KP414213562, T33, T34);
Chris@82 222 T3h = FNMS(KP414213562, T36, T37);
Chris@82 223 T3i = T3g + T3h;
Chris@82 224 T3R = T3h - T3g;
Chris@82 225 }
Chris@82 226 {
Chris@82 227 E TY, T19, T1u, T1x;
Chris@82 228 TY = FMA(KP414213562, TX, TS);
Chris@82 229 T19 = FNMS(KP414213562, T18, T13);
Chris@82 230 T1a = TY + T19;
Chris@82 231 T2g = T19 - TY;
Chris@82 232 T1u = T1s + T1t;
Chris@82 233 T1x = T1v + T1w;
Chris@82 234 T1y = T1u + T1x;
Chris@82 235 T21 = T1x - T1u;
Chris@82 236 }
Chris@82 237 {
Chris@82 238 E T35, T38, T1n, T1o;
Chris@82 239 T35 = FMA(KP414213562, T34, T33);
Chris@82 240 T38 = FMA(KP414213562, T37, T36);
Chris@82 241 T39 = T35 - T38;
Chris@82 242 T3W = T35 + T38;
Chris@82 243 T1n = FNMS(KP414213562, TS, TX);
Chris@82 244 T1o = FMA(KP414213562, T13, T18);
Chris@82 245 T1p = T1n + T1o;
Chris@82 246 T2b = T1n - T1o;
Chris@82 247 }
Chris@82 248 }
/* Store index 0 of Rp/Ip/Rm/Im; this is the only output that uses just two twiddles, W[0] and W[1]. */
Chris@82 249 {
Chris@82 250 E Tv, T1G, T1b, T1q, T1c, T1H, Tw, T1r, T1I, T1d;
Chris@82 251 Tv = Tf + Tu;
Chris@82 252 T1G = T1y + T1F;
Chris@82 253 T1b = FMA(KP923879532, T1a, TN);
Chris@82 254 T1q = FMA(KP923879532, T1p, T1m);
Chris@82 255 Tw = W[0];
Chris@82 256 T1c = Tw * T1b;
Chris@82 257 T1H = Tw * T1q;
Chris@82 258 T1d = W[1];
Chris@82 259 T1r = FMA(T1d, T1q, T1c);
Chris@82 260 T1I = FNMS(T1d, T1b, T1H);
Chris@82 261 Rp[0] = Tv - T1r;
Chris@82 262 Ip[0] = T1G + T1I;
Chris@82 263 Rm[0] = Tv + T1r;
Chris@82 264 Im[0] = T1I - T1G;
Chris@82 265 }
/* Store index WS(rs, 4) using twiddles W[14] .. W[17]. */
Chris@82 266 {
Chris@82 267 E T1N, T1J, T1L, T1M, T1V, T1Q, T1T, T1R, T1X, T1K, T1P;
Chris@82 268 T1N = T1F - T1y;
Chris@82 269 T1K = Tf - Tu;
Chris@82 270 T1J = W[14];
Chris@82 271 T1L = T1J * T1K;
Chris@82 272 T1M = W[15];
Chris@82 273 T1V = T1M * T1K;
Chris@82 274 T1Q = FNMS(KP923879532, T1a, TN);
Chris@82 275 T1T = FNMS(KP923879532, T1p, T1m);
Chris@82 276 T1P = W[16];
Chris@82 277 T1R = T1P * T1Q;
Chris@82 278 T1X = T1P * T1T;
Chris@82 279 {
Chris@82 280 E T1O, T1W, T1U, T1Y, T1S;
Chris@82 281 T1O = FNMS(T1M, T1N, T1L);
Chris@82 282 T1W = FMA(T1J, T1N, T1V);
Chris@82 283 T1S = W[17];
Chris@82 284 T1U = FMA(T1S, T1T, T1R);
Chris@82 285 T1Y = FNMS(T1S, T1Q, T1X);
Chris@82 286 Rp[WS(rs, 4)] = T1O - T1U;
Chris@82 287 Ip[WS(rs, 4)] = T1W + T1Y;
Chris@82 288 Rm[WS(rs, 4)] = T1O + T1U;
Chris@82 289 Im[WS(rs, 4)] = T1Y - T1W;
Chris@82 290 }
Chris@82 291 }
/* Store index WS(rs, 6) using twiddles W[22] .. W[25]. */
Chris@82 292 {
Chris@82 293 E T2r, T2n, T2p, T2q, T2z, T2u, T2x, T2v, T2B, T2o, T2t;
Chris@82 294 T2r = T26 - T25;
Chris@82 295 T2o = T20 - T21;
Chris@82 296 T2n = W[22];
Chris@82 297 T2p = T2n * T2o;
Chris@82 298 T2q = W[23];
Chris@82 299 T2z = T2q * T2o;
Chris@82 300 T2u = FNMS(KP923879532, T2b, T2a);
Chris@82 301 T2x = FNMS(KP923879532, T2g, T2f);
Chris@82 302 T2t = W[24];
Chris@82 303 T2v = T2t * T2u;
Chris@82 304 T2B = T2t * T2x;
Chris@82 305 {
Chris@82 306 E T2s, T2A, T2y, T2C, T2w;
Chris@82 307 T2s = FNMS(T2q, T2r, T2p);
Chris@82 308 T2A = FMA(T2n, T2r, T2z);
Chris@82 309 T2w = W[25];
Chris@82 310 T2y = FMA(T2w, T2x, T2v);
Chris@82 311 T2C = FNMS(T2w, T2u, T2B);
Chris@82 312 Rp[WS(rs, 6)] = T2s - T2y;
Chris@82 313 Ip[WS(rs, 6)] = T2A + T2C;
Chris@82 314 Rm[WS(rs, 6)] = T2s + T2y;
Chris@82 315 Im[WS(rs, 6)] = T2C - T2A;
Chris@82 316 }
Chris@82 317 }
/* Store index WS(rs, 2) using twiddles W[6] .. W[9]. */
Chris@82 318 {
Chris@82 319 E T27, T1Z, T23, T24, T2j, T2c, T2h, T2d, T2l, T22, T29;
Chris@82 320 T27 = T25 + T26;
Chris@82 321 T22 = T20 + T21;
Chris@82 322 T1Z = W[6];
Chris@82 323 T23 = T1Z * T22;
Chris@82 324 T24 = W[7];
Chris@82 325 T2j = T24 * T22;
Chris@82 326 T2c = FMA(KP923879532, T2b, T2a);
Chris@82 327 T2h = FMA(KP923879532, T2g, T2f);
Chris@82 328 T29 = W[8];
Chris@82 329 T2d = T29 * T2c;
Chris@82 330 T2l = T29 * T2h;
Chris@82 331 {
Chris@82 332 E T28, T2k, T2i, T2m, T2e;
Chris@82 333 T28 = FNMS(T24, T27, T23);
Chris@82 334 T2k = FMA(T1Z, T27, T2j);
Chris@82 335 T2e = W[9];
Chris@82 336 T2i = FMA(T2e, T2h, T2d);
Chris@82 337 T2m = FNMS(T2e, T2c, T2l);
Chris@82 338 Rp[WS(rs, 2)] = T28 - T2i;
Chris@82 339 Ip[WS(rs, 2)] = T2k + T2m;
Chris@82 340 Rm[WS(rs, 2)] = T28 + T2i;
Chris@82 341 Im[WS(rs, 2)] = T2m - T2k;
Chris@82 342 }
Chris@82 343 }
/* Store indices WS(rs, 3) and WS(rs, 7) using twiddles W[10] .. W[13] and W[26] .. W[29] respectively. */
Chris@82 344 {
Chris@82 345 E T3N, T47, T43, T45, T46, T4f, T3F, T3J, T3K, T3Z, T3S, T3X, T3T, T41, T4a;
Chris@82 346 E T4d, T4b, T4h;
Chris@82 347 {
Chris@82 348 E T3M, T44, T3I, T3P, T49;
Chris@82 349 T3M = T2J - T2M;
Chris@82 350 T3N = FMA(KP707106781, T3M, T3L);
Chris@82 351 T47 = FNMS(KP707106781, T3M, T3L);
Chris@82 352 T44 = FNMS(KP707106781, T3H, T3G);
Chris@82 353 T43 = W[26];
Chris@82 354 T45 = T43 * T44;
Chris@82 355 T46 = W[27];
Chris@82 356 T4f = T46 * T44;
Chris@82 357 T3I = FMA(KP707106781, T3H, T3G);
Chris@82 358 T3F = W[10];
Chris@82 359 T3J = T3F * T3I;
Chris@82 360 T3K = W[11];
Chris@82 361 T3Z = T3K * T3I;
Chris@82 362 T3S = FMA(KP923879532, T3R, T3Q);
Chris@82 363 T3X = FNMS(KP923879532, T3W, T3V);
Chris@82 364 T3P = W[12];
Chris@82 365 T3T = T3P * T3S;
Chris@82 366 T41 = T3P * T3X;
Chris@82 367 T4a = FNMS(KP923879532, T3R, T3Q);
Chris@82 368 T4d = FMA(KP923879532, T3W, T3V);
Chris@82 369 T49 = W[28];
Chris@82 370 T4b = T49 * T4a;
Chris@82 371 T4h = T49 * T4d;
Chris@82 372 }
Chris@82 373 {
Chris@82 374 E T3O, T40, T3Y, T42, T3U;
Chris@82 375 T3O = FNMS(T3K, T3N, T3J);
Chris@82 376 T40 = FMA(T3F, T3N, T3Z);
Chris@82 377 T3U = W[13];
Chris@82 378 T3Y = FMA(T3U, T3X, T3T);
Chris@82 379 T42 = FNMS(T3U, T3S, T41);
Chris@82 380 Rp[WS(rs, 3)] = T3O - T3Y;
Chris@82 381 Ip[WS(rs, 3)] = T40 + T42;
Chris@82 382 Rm[WS(rs, 3)] = T3O + T3Y;
Chris@82 383 Im[WS(rs, 3)] = T42 - T40;
Chris@82 384 }
Chris@82 385 {
Chris@82 386 E T48, T4g, T4e, T4i, T4c;
Chris@82 387 T48 = FNMS(T46, T47, T45);
Chris@82 388 T4g = FMA(T43, T47, T4f);
Chris@82 389 T4c = W[29];
Chris@82 390 T4e = FMA(T4c, T4d, T4b);
Chris@82 391 T4i = FNMS(T4c, T4a, T4h);
Chris@82 392 Rp[WS(rs, 7)] = T48 - T4e;
Chris@82 393 Ip[WS(rs, 7)] = T4g + T4i;
Chris@82 394 Rm[WS(rs, 7)] = T48 + T4e;
Chris@82 395 Im[WS(rs, 7)] = T4i - T4g;
Chris@82 396 }
Chris@82 397 }
/* Store indices WS(rs, 1) and WS(rs, 5) using twiddles W[2] .. W[5] and W[18] .. W[21] respectively. */
Chris@82 398 {
Chris@82 399 E T2X, T3t, T3p, T3r, T3s, T3B, T2D, T2P, T2Q, T3l, T3a, T3j, T3b, T3n, T3w;
Chris@82 400 E T3z, T3x, T3D;
Chris@82 401 {
Chris@82 402 E T2W, T3q, T2O, T2Z, T3v;
Chris@82 403 T2W = T2U + T2V;
Chris@82 404 T2X = FMA(KP707106781, T2W, T2T);
Chris@82 405 T3t = FNMS(KP707106781, T2W, T2T);
Chris@82 406 T3q = FNMS(KP707106781, T2N, T2G);
Chris@82 407 T3p = W[18];
Chris@82 408 T3r = T3p * T3q;
Chris@82 409 T3s = W[19];
Chris@82 410 T3B = T3s * T3q;
Chris@82 411 T2O = FMA(KP707106781, T2N, T2G);
Chris@82 412 T2D = W[2];
Chris@82 413 T2P = T2D * T2O;
Chris@82 414 T2Q = W[3];
Chris@82 415 T3l = T2Q * T2O;
Chris@82 416 T3a = FMA(KP923879532, T39, T32);
Chris@82 417 T3j = FNMS(KP923879532, T3i, T3f);
Chris@82 418 T2Z = W[4];
Chris@82 419 T3b = T2Z * T3a;
Chris@82 420 T3n = T2Z * T3j;
Chris@82 421 T3w = FNMS(KP923879532, T39, T32);
Chris@82 422 T3z = FMA(KP923879532, T3i, T3f);
Chris@82 423 T3v = W[20];
Chris@82 424 T3x = T3v * T3w;
Chris@82 425 T3D = T3v * T3z;
Chris@82 426 }
Chris@82 427 {
Chris@82 428 E T2Y, T3m, T3k, T3o, T3c;
Chris@82 429 T2Y = FNMS(T2Q, T2X, T2P);
Chris@82 430 T3m = FMA(T2D, T2X, T3l);
Chris@82 431 T3c = W[5];
Chris@82 432 T3k = FMA(T3c, T3j, T3b);
Chris@82 433 T3o = FNMS(T3c, T3a, T3n);
Chris@82 434 Rp[WS(rs, 1)] = T2Y - T3k;
Chris@82 435 Ip[WS(rs, 1)] = T3m + T3o;
Chris@82 436 Rm[WS(rs, 1)] = T2Y + T3k;
Chris@82 437 Im[WS(rs, 1)] = T3o - T3m;
Chris@82 438 }
Chris@82 439 {
Chris@82 440 E T3u, T3C, T3A, T3E, T3y;
Chris@82 441 T3u = FNMS(T3s, T3t, T3r);
Chris@82 442 T3C = FMA(T3p, T3t, T3B);
Chris@82 443 T3y = W[21];
Chris@82 444 T3A = FMA(T3y, T3z, T3x);
Chris@82 445 T3E = FNMS(T3y, T3w, T3D);
Chris@82 446 Rp[WS(rs, 5)] = T3u - T3A;
Chris@82 447 Ip[WS(rs, 5)] = T3C + T3E;
Chris@82 448 Rm[WS(rs, 5)] = T3u + T3A;
Chris@82 449 Im[WS(rs, 5)] = T3E - T3C;
Chris@82 450 }
Chris@82 451 }
Chris@82 452 }
Chris@82 453 }
Chris@82 454 }
Chris@82 455
/*
 * Twiddle-instruction table for the FMA build of hc2cbdft_16; the
 * descriptor `desc` in this branch points at it.
 * NOTE(review): the TW_FULL entry carries 16, the transform size, and the
 * kernel reads 30 = 2 * (16 - 1) twiddle values per iteration; presumably
 * TW_FULL requests that full set and TW_NEXT terminates the list --
 * confirm against the tw_instr definition.
 */
Chris@82 456 static const tw_instr twinstr[] = {
Chris@82 457 {TW_FULL, 1, 16},
Chris@82 458 {TW_NEXT, 1, 0}
Chris@82 459 };
Chris@82 460
/*
 * Descriptor for the FMA build: size 16, codelet name, twiddle table and
 * &GENUS. The first three numbers in the trailing initializer equal the
 * operation counts quoted in this variant's header comment (136 additions,
 * 30 multiplications, 70 fused multiply/adds).
 * NOTE(review): the meaning of the fourth entry (0) is not visible here --
 * confirm against the hc2c_desc definition.
 */
Chris@82 461 static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, {136, 30, 70, 0} };
Chris@82 462
/*
 * Registration hook (FMA build): hands the hc2cbdft_16 kernel and its
 * descriptor to X(khc2c_register) for planner p, tagged HC2C_VIA_DFT.
 */
Chris@82 463 void X(codelet_hc2cbdft_16) (planner *p) {
Chris@82 464 X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
Chris@82 465 }
Chris@82 466 #else
Chris@82 467
Chris@82 468 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include rdft/scalar/hc2cb.h */
Chris@82 469
Chris@82 470 /*
Chris@82 471 * This function contains 206 FP additions, 84 FP multiplications,
Chris@82 472 * (or, 168 additions, 46 multiplications, 38 fused multiply/add),
Chris@82 473 * 60 stack variables, 3 constants, and 64 memory accesses
Chris@82 474 */
Chris@82 475 #include "rdft/scalar/hc2cb.h"
Chris@82 476
/*
 * hc2cbdft_16, non-FMA variant: compiled in the #else branch, i.e. when
 * neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 * Machine-generated by genfft (see the "Generated by" line above for the
 * exact options); do not hand-edit the arithmetic.
 *
 * For every m in [mb, me) the body loads 32 values -- index 0 and
 * WS(rs, 1) .. WS(rs, 7) of each of Rp, Ip, Rm and Im -- combines them
 * with the 30 twiddle values W[0] .. W[29], and stores 32 results back
 * to the same locations. All loads are done before the first store.
 *
 * Rp, Ip, Rm, Im: data arrays, updated in place.
 * W:              twiddle values; offset by (mb - 1) * 30 on loop entry.
 * rs:             stride handed to WS() to address the eight elements.
 * mb, me:         first and one-past-last value of the loop counter m.
 * ms:             per-iteration step; Rp/Ip move forward by ms while
 *                 Rm/Im move backward by ms.
 */
Chris@82 477 static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 478 {
/* Numerically: KP923879532 = cos(pi/8), KP382683432 = sin(pi/8), KP707106781 = sqrt(2)/2. */
Chris@82 479 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 480 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 481 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 482 {
Chris@82 483 INT m;
/* One pass per m: W advances by 30 each time, matching the 30 twiddle values W[0..29] read below. */
Chris@82 484 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@82 485 E TB, T2L, T30, T1n, Tf, T1U, T2H, T3p, T1E, T1Z, TM, T31, T2s, T3k, T1i;
Chris@82 486 E T2M, Tu, T1Y, T2Q, T2X, T2T, T2Y, TY, T1d, T19, T1e, T2v, T2C, T2y, T2D;
Chris@82 487 E T1x, T1V;
/* Stage 1: load Rp/Ip at indices 0, 2, 4, 6 and Rm/Im at indices 1, 3, 5, 7, then form sums and differences. */
Chris@82 488 {
Chris@82 489 E T3, T1j, TA, T1B, T6, Tx, T1m, T1C, Ta, TC, TF, T1y, Td, TH, TK;
Chris@82 490 E T1z;
Chris@82 491 {
Chris@82 492 E T1, T2, Ty, Tz;
Chris@82 493 T1 = Rp[0];
Chris@82 494 T2 = Rm[WS(rs, 7)];
Chris@82 495 T3 = T1 + T2;
Chris@82 496 T1j = T1 - T2;
Chris@82 497 Ty = Ip[0];
Chris@82 498 Tz = Im[WS(rs, 7)];
Chris@82 499 TA = Ty + Tz;
Chris@82 500 T1B = Ty - Tz;
Chris@82 501 }
Chris@82 502 {
Chris@82 503 E T4, T5, T1k, T1l;
Chris@82 504 T4 = Rp[WS(rs, 4)];
Chris@82 505 T5 = Rm[WS(rs, 3)];
Chris@82 506 T6 = T4 + T5;
Chris@82 507 Tx = T4 - T5;
Chris@82 508 T1k = Ip[WS(rs, 4)];
Chris@82 509 T1l = Im[WS(rs, 3)];
Chris@82 510 T1m = T1k + T1l;
Chris@82 511 T1C = T1k - T1l;
Chris@82 512 }
Chris@82 513 {
Chris@82 514 E T8, T9, TD, TE;
Chris@82 515 T8 = Rp[WS(rs, 2)];
Chris@82 516 T9 = Rm[WS(rs, 5)];
Chris@82 517 Ta = T8 + T9;
Chris@82 518 TC = T8 - T9;
Chris@82 519 TD = Ip[WS(rs, 2)];
Chris@82 520 TE = Im[WS(rs, 5)];
Chris@82 521 TF = TD + TE;
Chris@82 522 T1y = TD - TE;
Chris@82 523 }
Chris@82 524 {
Chris@82 525 E Tb, Tc, TI, TJ;
Chris@82 526 Tb = Rm[WS(rs, 1)];
Chris@82 527 Tc = Rp[WS(rs, 6)];
Chris@82 528 Td = Tb + Tc;
Chris@82 529 TH = Tb - Tc;
Chris@82 530 TI = Im[WS(rs, 1)];
Chris@82 531 TJ = Ip[WS(rs, 6)];
Chris@82 532 TK = TI + TJ;
Chris@82 533 T1z = TJ - TI;
Chris@82 534 }
Chris@82 535 {
Chris@82 536 E T7, Te, TG, TL;
Chris@82 537 TB = Tx + TA;
Chris@82 538 T2L = TA - Tx;
Chris@82 539 T30 = T1j + T1m;
Chris@82 540 T1n = T1j - T1m;
Chris@82 541 T7 = T3 + T6;
Chris@82 542 Te = Ta + Td;
Chris@82 543 Tf = T7 + Te;
Chris@82 544 T1U = T7 - Te;
Chris@82 545 {
Chris@82 546 E T2F, T2G, T1A, T1D;
Chris@82 547 T2F = Ta - Td;
Chris@82 548 T2G = T1B - T1C;
Chris@82 549 T2H = T2F + T2G;
Chris@82 550 T3p = T2G - T2F;
Chris@82 551 T1A = T1y + T1z;
Chris@82 552 T1D = T1B + T1C;
Chris@82 553 T1E = T1A + T1D;
Chris@82 554 T1Z = T1D - T1A;
Chris@82 555 }
Chris@82 556 TG = TC + TF;
Chris@82 557 TL = TH + TK;
Chris@82 558 TM = KP707106781 * (TG - TL);
Chris@82 559 T31 = KP707106781 * (TG + TL);
Chris@82 560 {
Chris@82 561 E T2q, T2r, T1g, T1h;
Chris@82 562 T2q = T3 - T6;
Chris@82 563 T2r = T1z - T1y;
Chris@82 564 T2s = T2q + T2r;
Chris@82 565 T3k = T2q - T2r;
Chris@82 566 T1g = TC - TF;
Chris@82 567 T1h = TH - TK;
Chris@82 568 T1i = KP707106781 * (T1g + T1h);
Chris@82 569 T2M = KP707106781 * (T1g - T1h);
Chris@82 570 }
Chris@82 571 }
Chris@82 572 }
/* Stage 2: load Rp/Ip at indices 1, 3, 5, 7 and Rm/Im at indices 0, 2, 4, 6, then form the remaining intermediates. */
Chris@82 573 {
Chris@82 574 E Ti, TT, TR, T1r, Tl, TO, TW, T1s, Tp, T14, T12, T1u, Ts, TZ, T17;
Chris@82 575 E T1v;
Chris@82 576 {
Chris@82 577 E Tg, Th, TP, TQ;
Chris@82 578 Tg = Rp[WS(rs, 1)];
Chris@82 579 Th = Rm[WS(rs, 6)];
Chris@82 580 Ti = Tg + Th;
Chris@82 581 TT = Tg - Th;
Chris@82 582 TP = Ip[WS(rs, 1)];
Chris@82 583 TQ = Im[WS(rs, 6)];
Chris@82 584 TR = TP + TQ;
Chris@82 585 T1r = TP - TQ;
Chris@82 586 }
Chris@82 587 {
Chris@82 588 E Tj, Tk, TU, TV;
Chris@82 589 Tj = Rp[WS(rs, 5)];
Chris@82 590 Tk = Rm[WS(rs, 2)];
Chris@82 591 Tl = Tj + Tk;
Chris@82 592 TO = Tj - Tk;
Chris@82 593 TU = Ip[WS(rs, 5)];
Chris@82 594 TV = Im[WS(rs, 2)];
Chris@82 595 TW = TU + TV;
Chris@82 596 T1s = TU - TV;
Chris@82 597 }
Chris@82 598 {
Chris@82 599 E Tn, To, T10, T11;
Chris@82 600 Tn = Rm[0];
Chris@82 601 To = Rp[WS(rs, 7)];
Chris@82 602 Tp = Tn + To;
Chris@82 603 T14 = Tn - To;
Chris@82 604 T10 = Im[0];
Chris@82 605 T11 = Ip[WS(rs, 7)];
Chris@82 606 T12 = T10 + T11;
Chris@82 607 T1u = T11 - T10;
Chris@82 608 }
Chris@82 609 {
Chris@82 610 E Tq, Tr, T15, T16;
Chris@82 611 Tq = Rp[WS(rs, 3)];
Chris@82 612 Tr = Rm[WS(rs, 4)];
Chris@82 613 Ts = Tq + Tr;
Chris@82 614 TZ = Tq - Tr;
Chris@82 615 T15 = Ip[WS(rs, 3)];
Chris@82 616 T16 = Im[WS(rs, 4)];
Chris@82 617 T17 = T15 + T16;
Chris@82 618 T1v = T15 - T16;
Chris@82 619 }
Chris@82 620 {
Chris@82 621 E Tm, Tt, T2O, T2P;
Chris@82 622 Tm = Ti + Tl;
Chris@82 623 Tt = Tp + Ts;
Chris@82 624 Tu = Tm + Tt;
Chris@82 625 T1Y = Tm - Tt;
Chris@82 626 T2O = TR - TO;
Chris@82 627 T2P = TT + TW;
Chris@82 628 T2Q = FMA(KP382683432, T2O, KP923879532 * T2P);
Chris@82 629 T2X = FNMS(KP923879532, T2O, KP382683432 * T2P);
Chris@82 630 }
Chris@82 631 {
Chris@82 632 E T2R, T2S, TS, TX;
Chris@82 633 T2R = TZ + T12;
Chris@82 634 T2S = T14 + T17;
Chris@82 635 T2T = FMA(KP382683432, T2R, KP923879532 * T2S);
Chris@82 636 T2Y = FNMS(KP923879532, T2R, KP382683432 * T2S);
Chris@82 637 TS = TO + TR;
Chris@82 638 TX = TT - TW;
Chris@82 639 TY = FMA(KP923879532, TS, KP382683432 * TX);
Chris@82 640 T1d = FNMS(KP382683432, TS, KP923879532 * TX);
Chris@82 641 }
Chris@82 642 {
Chris@82 643 E T13, T18, T2t, T2u;
Chris@82 644 T13 = TZ - T12;
Chris@82 645 T18 = T14 - T17;
Chris@82 646 T19 = FNMS(KP382683432, T18, KP923879532 * T13);
Chris@82 647 T1e = FMA(KP382683432, T13, KP923879532 * T18);
Chris@82 648 T2t = Ti - Tl;
Chris@82 649 T2u = T1r - T1s;
Chris@82 650 T2v = T2t - T2u;
Chris@82 651 T2C = T2t + T2u;
Chris@82 652 }
Chris@82 653 {
Chris@82 654 E T2w, T2x, T1t, T1w;
Chris@82 655 T2w = Tp - Ts;
Chris@82 656 T2x = T1u - T1v;
Chris@82 657 T2y = T2w + T2x;
Chris@82 658 T2D = T2x - T2w;
Chris@82 659 T1t = T1r + T1s;
Chris@82 660 T1w = T1u + T1v;
Chris@82 661 T1x = T1t + T1w;
Chris@82 662 T1V = T1w - T1t;
Chris@82 663 }
Chris@82 664 }
/* Store index 0 (twiddles W[0], W[1] only) and index WS(rs, 4) (twiddles W[14] .. W[17]). */
Chris@82 665 {
Chris@82 666 E Tv, T1F, T1b, T1N, T1p, T1P, T1L, T1R;
Chris@82 667 Tv = Tf + Tu;
Chris@82 668 T1F = T1x + T1E;
Chris@82 669 {
Chris@82 670 E TN, T1a, T1f, T1o;
Chris@82 671 TN = TB + TM;
Chris@82 672 T1a = TY + T19;
Chris@82 673 T1b = TN + T1a;
Chris@82 674 T1N = TN - T1a;
Chris@82 675 T1f = T1d + T1e;
Chris@82 676 T1o = T1i + T1n;
Chris@82 677 T1p = T1f + T1o;
Chris@82 678 T1P = T1o - T1f;
Chris@82 679 {
Chris@82 680 E T1I, T1K, T1H, T1J;
Chris@82 681 T1I = Tf - Tu;
Chris@82 682 T1K = T1E - T1x;
Chris@82 683 T1H = W[14];
Chris@82 684 T1J = W[15];
Chris@82 685 T1L = FNMS(T1J, T1K, T1H * T1I);
Chris@82 686 T1R = FMA(T1J, T1I, T1H * T1K);
Chris@82 687 }
Chris@82 688 }
Chris@82 689 {
Chris@82 690 E T1q, T1G, Tw, T1c;
Chris@82 691 Tw = W[0];
Chris@82 692 T1c = W[1];
Chris@82 693 T1q = FMA(Tw, T1b, T1c * T1p);
Chris@82 694 T1G = FNMS(T1c, T1b, Tw * T1p);
Chris@82 695 Rp[0] = Tv - T1q;
Chris@82 696 Ip[0] = T1F + T1G;
Chris@82 697 Rm[0] = Tv + T1q;
Chris@82 698 Im[0] = T1G - T1F;
Chris@82 699 }
Chris@82 700 {
Chris@82 701 E T1Q, T1S, T1M, T1O;
Chris@82 702 T1M = W[16];
Chris@82 703 T1O = W[17];
Chris@82 704 T1Q = FMA(T1M, T1N, T1O * T1P);
Chris@82 705 T1S = FNMS(T1O, T1N, T1M * T1P);
Chris@82 706 Rp[WS(rs, 4)] = T1L - T1Q;
Chris@82 707 Ip[WS(rs, 4)] = T1R + T1S;
Chris@82 708 Rm[WS(rs, 4)] = T1L + T1Q;
Chris@82 709 Im[WS(rs, 4)] = T1S - T1R;
Chris@82 710 }
Chris@82 711 }
/* Store indices WS(rs, 2) and WS(rs, 6) using twiddles W[6] .. W[9] and W[22] .. W[25] respectively. */
Chris@82 712 {
Chris@82 713 E T25, T2j, T29, T2l, T21, T2b, T2h, T2n;
Chris@82 714 {
Chris@82 715 E T23, T24, T27, T28;
Chris@82 716 T23 = TB - TM;
Chris@82 717 T24 = T1d - T1e;
Chris@82 718 T25 = T23 + T24;
Chris@82 719 T2j = T23 - T24;
Chris@82 720 T27 = T19 - TY;
Chris@82 721 T28 = T1n - T1i;
Chris@82 722 T29 = T27 + T28;
Chris@82 723 T2l = T28 - T27;
Chris@82 724 }
Chris@82 725 {
Chris@82 726 E T1W, T20, T1T, T1X;
Chris@82 727 T1W = T1U + T1V;
Chris@82 728 T20 = T1Y + T1Z;
Chris@82 729 T1T = W[6];
Chris@82 730 T1X = W[7];
Chris@82 731 T21 = FNMS(T1X, T20, T1T * T1W);
Chris@82 732 T2b = FMA(T1X, T1W, T1T * T20);
Chris@82 733 }
Chris@82 734 {
Chris@82 735 E T2e, T2g, T2d, T2f;
Chris@82 736 T2e = T1U - T1V;
Chris@82 737 T2g = T1Z - T1Y;
Chris@82 738 T2d = W[22];
Chris@82 739 T2f = W[23];
Chris@82 740 T2h = FNMS(T2f, T2g, T2d * T2e);
Chris@82 741 T2n = FMA(T2f, T2e, T2d * T2g);
Chris@82 742 }
Chris@82 743 {
Chris@82 744 E T2a, T2c, T22, T26;
Chris@82 745 T22 = W[8];
Chris@82 746 T26 = W[9];
Chris@82 747 T2a = FMA(T22, T25, T26 * T29);
Chris@82 748 T2c = FNMS(T26, T25, T22 * T29);
Chris@82 749 Rp[WS(rs, 2)] = T21 - T2a;
Chris@82 750 Ip[WS(rs, 2)] = T2b + T2c;
Chris@82 751 Rm[WS(rs, 2)] = T21 + T2a;
Chris@82 752 Im[WS(rs, 2)] = T2c - T2b;
Chris@82 753 }
Chris@82 754 {
Chris@82 755 E T2m, T2o, T2i, T2k;
Chris@82 756 T2i = W[24];
Chris@82 757 T2k = W[25];
Chris@82 758 T2m = FMA(T2i, T2j, T2k * T2l);
Chris@82 759 T2o = FNMS(T2k, T2j, T2i * T2l);
Chris@82 760 Rp[WS(rs, 6)] = T2h - T2m;
Chris@82 761 Ip[WS(rs, 6)] = T2n + T2o;
Chris@82 762 Rm[WS(rs, 6)] = T2h + T2m;
Chris@82 763 Im[WS(rs, 6)] = T2o - T2n;
Chris@82 764 }
Chris@82 765 }
/* Store indices WS(rs, 1) and WS(rs, 5) using twiddles W[2] .. W[5] and W[18] .. W[21] respectively. */
Chris@82 766 {
Chris@82 767 E T2A, T38, T2I, T3a, T2V, T3d, T33, T3f, T2z, T2E;
Chris@82 768 T2z = KP707106781 * (T2v + T2y);
Chris@82 769 T2A = T2s + T2z;
Chris@82 770 T38 = T2s - T2z;
Chris@82 771 T2E = KP707106781 * (T2C + T2D);
Chris@82 772 T2I = T2E + T2H;
Chris@82 773 T3a = T2H - T2E;
Chris@82 774 {
Chris@82 775 E T2N, T2U, T2Z, T32;
Chris@82 776 T2N = T2L + T2M;
Chris@82 777 T2U = T2Q - T2T;
Chris@82 778 T2V = T2N + T2U;
Chris@82 779 T3d = T2N - T2U;
Chris@82 780 T2Z = T2X + T2Y;
Chris@82 781 T32 = T30 - T31;
Chris@82 782 T33 = T2Z + T32;
Chris@82 783 T3f = T32 - T2Z;
Chris@82 784 }
Chris@82 785 {
Chris@82 786 E T2J, T35, T34, T36;
Chris@82 787 {
Chris@82 788 E T2p, T2B, T2K, T2W;
Chris@82 789 T2p = W[2];
Chris@82 790 T2B = W[3];
Chris@82 791 T2J = FNMS(T2B, T2I, T2p * T2A);
Chris@82 792 T35 = FMA(T2B, T2A, T2p * T2I);
Chris@82 793 T2K = W[4];
Chris@82 794 T2W = W[5];
Chris@82 795 T34 = FMA(T2K, T2V, T2W * T33);
Chris@82 796 T36 = FNMS(T2W, T2V, T2K * T33);
Chris@82 797 }
Chris@82 798 Rp[WS(rs, 1)] = T2J - T34;
Chris@82 799 Ip[WS(rs, 1)] = T35 + T36;
Chris@82 800 Rm[WS(rs, 1)] = T2J + T34;
Chris@82 801 Im[WS(rs, 1)] = T36 - T35;
Chris@82 802 }
Chris@82 803 {
Chris@82 804 E T3b, T3h, T3g, T3i;
Chris@82 805 {
Chris@82 806 E T37, T39, T3c, T3e;
Chris@82 807 T37 = W[18];
Chris@82 808 T39 = W[19];
Chris@82 809 T3b = FNMS(T39, T3a, T37 * T38);
Chris@82 810 T3h = FMA(T39, T38, T37 * T3a);
Chris@82 811 T3c = W[20];
Chris@82 812 T3e = W[21];
Chris@82 813 T3g = FMA(T3c, T3d, T3e * T3f);
Chris@82 814 T3i = FNMS(T3e, T3d, T3c * T3f);
Chris@82 815 }
Chris@82 816 Rp[WS(rs, 5)] = T3b - T3g;
Chris@82 817 Ip[WS(rs, 5)] = T3h + T3i;
Chris@82 818 Rm[WS(rs, 5)] = T3b + T3g;
Chris@82 819 Im[WS(rs, 5)] = T3i - T3h;
Chris@82 820 }
Chris@82 821 }
/* Store indices WS(rs, 3) and WS(rs, 7) using twiddles W[10] .. W[13] and W[26] .. W[29] respectively. */
Chris@82 822 {
Chris@82 823 E T3m, T3E, T3q, T3G, T3v, T3J, T3z, T3L, T3l, T3o;
Chris@82 824 T3l = KP707106781 * (T2D - T2C);
Chris@82 825 T3m = T3k + T3l;
Chris@82 826 T3E = T3k - T3l;
Chris@82 827 T3o = KP707106781 * (T2v - T2y);
Chris@82 828 T3q = T3o + T3p;
Chris@82 829 T3G = T3p - T3o;
Chris@82 830 {
Chris@82 831 E T3t, T3u, T3x, T3y;
Chris@82 832 T3t = T2L - T2M;
Chris@82 833 T3u = T2X - T2Y;
Chris@82 834 T3v = T3t + T3u;
Chris@82 835 T3J = T3t - T3u;
Chris@82 836 T3x = T31 + T30;
Chris@82 837 T3y = T2Q + T2T;
Chris@82 838 T3z = T3x - T3y;
Chris@82 839 T3L = T3y + T3x;
Chris@82 840 }
Chris@82 841 {
Chris@82 842 E T3r, T3B, T3A, T3C;
Chris@82 843 {
Chris@82 844 E T3j, T3n, T3s, T3w;
Chris@82 845 T3j = W[10];
Chris@82 846 T3n = W[11];
Chris@82 847 T3r = FNMS(T3n, T3q, T3j * T3m);
Chris@82 848 T3B = FMA(T3n, T3m, T3j * T3q);
Chris@82 849 T3s = W[12];
Chris@82 850 T3w = W[13];
Chris@82 851 T3A = FMA(T3s, T3v, T3w * T3z);
Chris@82 852 T3C = FNMS(T3w, T3v, T3s * T3z);
Chris@82 853 }
Chris@82 854 Rp[WS(rs, 3)] = T3r - T3A;
Chris@82 855 Ip[WS(rs, 3)] = T3B + T3C;
Chris@82 856 Rm[WS(rs, 3)] = T3r + T3A;
Chris@82 857 Im[WS(rs, 3)] = T3C - T3B;
Chris@82 858 }
Chris@82 859 {
Chris@82 860 E T3H, T3N, T3M, T3O;
Chris@82 861 {
Chris@82 862 E T3D, T3F, T3I, T3K;
Chris@82 863 T3D = W[26];
Chris@82 864 T3F = W[27];
Chris@82 865 T3H = FNMS(T3F, T3G, T3D * T3E);
Chris@82 866 T3N = FMA(T3F, T3E, T3D * T3G);
Chris@82 867 T3I = W[28];
Chris@82 868 T3K = W[29];
Chris@82 869 T3M = FMA(T3I, T3J, T3K * T3L);
Chris@82 870 T3O = FNMS(T3K, T3J, T3I * T3L);
Chris@82 871 }
Chris@82 872 Rp[WS(rs, 7)] = T3H - T3M;
Chris@82 873 Ip[WS(rs, 7)] = T3N + T3O;
Chris@82 874 Rm[WS(rs, 7)] = T3H + T3M;
Chris@82 875 Im[WS(rs, 7)] = T3O - T3N;
Chris@82 876 }
Chris@82 877 }
Chris@82 878 }
Chris@82 879 }
Chris@82 880 }
Chris@82 881
/*
 * Twiddle-instruction table for the non-FMA build of hc2cbdft_16; the
 * descriptor `desc` in this branch points at it.
 * NOTE(review): the TW_FULL entry carries 16, the transform size, and the
 * kernel reads 30 = 2 * (16 - 1) twiddle values per iteration; presumably
 * TW_FULL requests that full set and TW_NEXT terminates the list --
 * confirm against the tw_instr definition.
 */
Chris@82 882 static const tw_instr twinstr[] = {
Chris@82 883 {TW_FULL, 1, 16},
Chris@82 884 {TW_NEXT, 1, 0}
Chris@82 885 };
Chris@82 886
/*
 * Descriptor for the non-FMA build: size 16, codelet name, twiddle table
 * and &GENUS. The first three numbers in the trailing initializer equal
 * the operation counts quoted in this variant's header comment (168
 * additions, 46 multiplications, 38 fused multiply/adds).
 * NOTE(review): the meaning of the fourth entry (0) is not visible here --
 * confirm against the hc2c_desc definition.
 */
Chris@82 887 static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, {168, 46, 38, 0} };
Chris@82 888
/*
 * Registration hook (non-FMA build): hands the hc2cbdft_16 kernel and its
 * descriptor to X(khc2c_register) for planner p, tagged HC2C_VIA_DFT.
 */
Chris@82 889 void X(codelet_hc2cbdft_16) (planner *p) {
Chris@82 890 X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
Chris@82 891 }
Chris@82 892 #endif