annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cbdft_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:51:59 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 206 FP additions, 100 FP multiplications,
Chris@42 32 * (or, 136 additions, 30 multiplications, 70 fused multiply/add),
Chris@42 33 * 97 stack variables, 3 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cbdft_16 (HAVE_FMA variant) -- genfft-generated size-16 codelet.
 *
 * For every m in [mb, me) the loop body
 *   - loads 8 values from each of Rp, Ip, Rm and Im (offsets WS(rs, 0..7)),
 *   - combines them with additions/subtractions and the three constants
 *     declared below (about 0.9239, 0.4142 and 0.7071),
 *   - multiplies intermediate results by the 30 twiddle values W[0]..W[29],
 *   - stores 8 results back into each of the same four arrays.
 * All 32 loads of an iteration precede its first store, so the update is
 * done in place.
 *
 * Parameters:
 *   Rp, Ip  arrays read and written; advanced by +ms after each iteration
 *   Rm, Im  arrays read and written; advanced by -ms after each iteration
 *   W       twiddle table (read only); offset by (mb - 1) * 30 before the
 *           first iteration and advanced by 30 after each iteration
 *   rs      stride passed to WS() to address the 8 elements of each array
 *   mb, me  half-open range of the loop counter m
 *   ms      per-iteration pointer step for Rp/Ip/Rm/Im
 *
 * NOTE(review): DK, E, R, INT, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come
 * from headers not visible here; FMA/FNMS are presumably the fused
 * multiply-add / negated multiply-subtract helpers -- confirm in the
 * codelet headers before relying on their exact operand order.
 */
Chris@42 37 static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 42 {
Chris@42 43 INT m;
Chris@42 44 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@42 45 E T3w, T3z, T2Y, T3D, T3x, T3m, T3u, T3C, T3y, T3o, T3k, T3E, T3A;
Chris@42 46 {
Chris@42 47 E T20, Tf, T3Q, T32, T3V, T3f, T2a, TN, T2f, T1m, T3G, T2G, T3L, T2T, T26;
Chris@42 48 E T1F, T3M, T2N, T3H, T2W, T25, Tu, T1n, T1o, T3R, T3i, T2g, T1a, T21, T1y;
Chris@42 49 E T3W, T39;
Chris@42 50 {
Chris@42 51 E T2R, T1B, T2S, T1E;
Chris@42 52 {
Chris@42 53 E T1e, T3, T1C, TA, Tx, T6, T1D, T1h, Td, T1A, TL, T1k, Ta, TC, TF;
Chris@42 54 E T1z;
Chris@42 55 {
Chris@42 56 E T4, T5, T1f, T1g;
Chris@42 57 {
Chris@42 58 E T1, T2, Ty, Tz;
/* Load the first 16 inputs and form pairwise sums and differences. */
Chris@42 59 T1 = Rp[0];
Chris@42 60 T2 = Rm[WS(rs, 7)];
Chris@42 61 Ty = Ip[0];
Chris@42 62 Tz = Im[WS(rs, 7)];
Chris@42 63 T4 = Rp[WS(rs, 4)];
Chris@42 64 T1e = T1 - T2;
Chris@42 65 T3 = T1 + T2;
Chris@42 66 T1C = Ty - Tz;
Chris@42 67 TA = Ty + Tz;
Chris@42 68 T5 = Rm[WS(rs, 3)];
Chris@42 69 }
Chris@42 70 T1f = Ip[WS(rs, 4)];
Chris@42 71 T1g = Im[WS(rs, 3)];
Chris@42 72 {
Chris@42 73 E Tb, Tc, TI, TJ;
Chris@42 74 Tb = Rm[WS(rs, 1)];
Chris@42 75 Tx = T4 - T5;
Chris@42 76 T6 = T4 + T5;
Chris@42 77 T1D = T1f - T1g;
Chris@42 78 T1h = T1f + T1g;
Chris@42 79 Tc = Rp[WS(rs, 6)];
Chris@42 80 TI = Im[WS(rs, 1)];
Chris@42 81 TJ = Ip[WS(rs, 6)];
Chris@42 82 {
Chris@42 83 E T8, TH, TK, T9, TD, TE;
Chris@42 84 T8 = Rp[WS(rs, 2)];
Chris@42 85 Td = Tb + Tc;
Chris@42 86 TH = Tb - Tc;
Chris@42 87 T1A = TJ - TI;
Chris@42 88 TK = TI + TJ;
Chris@42 89 T9 = Rm[WS(rs, 5)];
Chris@42 90 TD = Ip[WS(rs, 2)];
Chris@42 91 TE = Im[WS(rs, 5)];
Chris@42 92 TL = TH + TK;
Chris@42 93 T1k = TH - TK;
Chris@42 94 Ta = T8 + T9;
Chris@42 95 TC = T8 - T9;
Chris@42 96 TF = TD + TE;
Chris@42 97 T1z = TD - TE;
Chris@42 98 }
Chris@42 99 }
Chris@42 100 }
Chris@42 101 {
Chris@42 102 E T2E, TB, T1l, T1i, T3d, T3e, TM, T2F;
Chris@42 103 {
Chris@42 104 E T7, TG, Te, T30, T31, T1j;
Chris@42 105 T2E = T3 - T6;
Chris@42 106 T7 = T3 + T6;
Chris@42 107 T1j = TC - TF;
Chris@42 108 TG = TC + TF;
Chris@42 109 Te = Ta + Td;
Chris@42 110 T2R = Ta - Td;
Chris@42 111 TB = Tx + TA;
Chris@42 112 T30 = TA - Tx;
Chris@42 113 T31 = T1j - T1k;
Chris@42 114 T1l = T1j + T1k;
Chris@42 115 T1i = T1e - T1h;
Chris@42 116 T3d = T1e + T1h;
Chris@42 117 T20 = T7 - Te;
Chris@42 118 Tf = T7 + Te;
Chris@42 119 T3Q = FNMS(KP707106781, T31, T30);
Chris@42 120 T32 = FMA(KP707106781, T31, T30);
Chris@42 121 T3e = TG + TL;
Chris@42 122 TM = TG - TL;
Chris@42 123 }
Chris@42 124 T3V = FMA(KP707106781, T3e, T3d);
Chris@42 125 T3f = FNMS(KP707106781, T3e, T3d);
Chris@42 126 T2a = FNMS(KP707106781, TM, TB);
Chris@42 127 TN = FMA(KP707106781, TM, TB);
Chris@42 128 T2F = T1A - T1z;
Chris@42 129 T1B = T1z + T1A;
Chris@42 130 T2f = FNMS(KP707106781, T1l, T1i);
Chris@42 131 T1m = FMA(KP707106781, T1l, T1i);
Chris@42 132 T3G = T2E - T2F;
Chris@42 133 T2G = T2E + T2F;
Chris@42 134 T2S = T1C - T1D;
Chris@42 135 T1E = T1C + T1D;
Chris@42 136 }
Chris@42 137 }
Chris@42 138 {
Chris@42 139 E T34, TS, T2H, Tm, T1u, T2I, T33, TX, Tq, T14, Tp, T1v, T12, Tr, T15;
Chris@42 140 E T16;
Chris@42 141 {
Chris@42 142 E Tj, TT, Ti, T1s, TR, Tk, TU, TV;
Chris@42 143 {
Chris@42 144 E Tg, Th, TP, TQ;
/* Load the remaining 16 inputs, interleaved with further sums/differences. */
Chris@42 145 Tg = Rp[WS(rs, 1)];
Chris@42 146 T3L = T2S - T2R;
Chris@42 147 T2T = T2R + T2S;
Chris@42 148 T26 = T1E - T1B;
Chris@42 149 T1F = T1B + T1E;
Chris@42 150 Th = Rm[WS(rs, 6)];
Chris@42 151 TP = Ip[WS(rs, 1)];
Chris@42 152 TQ = Im[WS(rs, 6)];
Chris@42 153 Tj = Rp[WS(rs, 5)];
Chris@42 154 TT = Tg - Th;
Chris@42 155 Ti = Tg + Th;
Chris@42 156 T1s = TP - TQ;
Chris@42 157 TR = TP + TQ;
Chris@42 158 Tk = Rm[WS(rs, 2)];
Chris@42 159 TU = Ip[WS(rs, 5)];
Chris@42 160 TV = Im[WS(rs, 2)];
Chris@42 161 }
Chris@42 162 {
Chris@42 163 E Tn, To, T10, T11;
Chris@42 164 Tn = Rm[0];
Chris@42 165 {
Chris@42 166 E TO, Tl, T1t, TW;
Chris@42 167 TO = Tj - Tk;
Chris@42 168 Tl = Tj + Tk;
Chris@42 169 T1t = TU - TV;
Chris@42 170 TW = TU + TV;
Chris@42 171 T34 = TR - TO;
Chris@42 172 TS = TO + TR;
Chris@42 173 T2H = Ti - Tl;
Chris@42 174 Tm = Ti + Tl;
Chris@42 175 T1u = T1s + T1t;
Chris@42 176 T2I = T1s - T1t;
Chris@42 177 T33 = TT + TW;
Chris@42 178 TX = TT - TW;
Chris@42 179 To = Rp[WS(rs, 7)];
Chris@42 180 }
Chris@42 181 T10 = Im[0];
Chris@42 182 T11 = Ip[WS(rs, 7)];
Chris@42 183 Tq = Rp[WS(rs, 3)];
Chris@42 184 T14 = Tn - To;
Chris@42 185 Tp = Tn + To;
Chris@42 186 T1v = T11 - T10;
Chris@42 187 T12 = T10 + T11;
Chris@42 188 Tr = Rm[WS(rs, 4)];
Chris@42 189 T15 = Ip[WS(rs, 3)];
Chris@42 190 T16 = Im[WS(rs, 4)];
Chris@42 191 }
Chris@42 192 }
Chris@42 193 {
Chris@42 194 E T13, T1x, T18, T35, T3g, T3h, T38, TY, T19;
Chris@42 195 {
Chris@42 196 E T2U, T2J, T37, Tt, T36, T2V, T2M, T2K, T2L;
Chris@42 197 T2U = T2H + T2I;
Chris@42 198 T2J = T2H - T2I;
Chris@42 199 {
Chris@42 200 E TZ, Ts, T1w, T17;
Chris@42 201 TZ = Tq - Tr;
Chris@42 202 Ts = Tq + Tr;
Chris@42 203 T1w = T15 - T16;
Chris@42 204 T17 = T15 + T16;
Chris@42 205 T37 = TZ + T12;
Chris@42 206 T13 = TZ - T12;
Chris@42 207 T2K = Tp - Ts;
Chris@42 208 Tt = Tp + Ts;
Chris@42 209 T1x = T1v + T1w;
Chris@42 210 T2L = T1v - T1w;
Chris@42 211 T36 = T14 + T17;
Chris@42 212 T18 = T14 - T17;
Chris@42 213 }
Chris@42 214 T2V = T2L - T2K;
Chris@42 215 T2M = T2K + T2L;
Chris@42 216 T3M = T2J - T2M;
Chris@42 217 T2N = T2J + T2M;
Chris@42 218 T3H = T2V - T2U;
Chris@42 219 T2W = T2U + T2V;
Chris@42 220 T35 = FMA(KP414213562, T34, T33);
Chris@42 221 T3g = FNMS(KP414213562, T33, T34);
Chris@42 222 T25 = Tm - Tt;
Chris@42 223 Tu = Tm + Tt;
Chris@42 224 T3h = FNMS(KP414213562, T36, T37);
Chris@42 225 T38 = FMA(KP414213562, T37, T36);
Chris@42 226 }
Chris@42 227 T1n = FNMS(KP414213562, TS, TX);
Chris@42 228 TY = FMA(KP414213562, TX, TS);
Chris@42 229 T19 = FNMS(KP414213562, T18, T13);
Chris@42 230 T1o = FMA(KP414213562, T13, T18);
Chris@42 231 T3R = T3h - T3g;
Chris@42 232 T3i = T3g + T3h;
Chris@42 233 T2g = T19 - TY;
Chris@42 234 T1a = TY + T19;
Chris@42 235 T21 = T1x - T1u;
Chris@42 236 T1y = T1u + T1x;
Chris@42 237 T3W = T35 + T38;
Chris@42 238 T39 = T35 - T38;
Chris@42 239 }
Chris@42 240 }
Chris@42 241 }
Chris@42 242 {
Chris@42 243 E T27, T22, T2c, T2u, T2x, T2h, T2s, T2A, T2w, T2B, T2v;
Chris@42 244 {
Chris@42 245 E T1K, Tv, T1G, T1N, T1Q, T1b, T2b, T1p, Tw, T1d;
/* All 32 inputs are loaded by now; the rest of the body combines the
   intermediates, multiplies by entries of W and stores the results. */
Chris@42 246 T1K = Tf - Tu;
Chris@42 247 Tv = Tf + Tu;
Chris@42 248 T1G = T1y + T1F;
Chris@42 249 T1N = T1F - T1y;
Chris@42 250 T1Q = FNMS(KP923879532, T1a, TN);
Chris@42 251 T1b = FMA(KP923879532, T1a, TN);
Chris@42 252 T2b = T1n - T1o;
Chris@42 253 T1p = T1n + T1o;
Chris@42 254 Tw = W[0];
Chris@42 255 T1d = W[1];
Chris@42 256 {
Chris@42 257 E T1T, T1O, T1W, T1S, T1X, T1R;
Chris@42 258 {
Chris@42 259 E T1J, T1M, T1L, T1V, T1P, T1q;
Chris@42 260 T1T = FNMS(KP923879532, T1p, T1m);
Chris@42 261 T1q = FMA(KP923879532, T1p, T1m);
Chris@42 262 {
Chris@42 263 E T1c, T1I, T1H, T1r;
Chris@42 264 T1c = Tw * T1b;
Chris@42 265 T1J = W[14];
Chris@42 266 T1H = Tw * T1q;
Chris@42 267 T1r = FMA(T1d, T1q, T1c);
Chris@42 268 T1M = W[15];
Chris@42 269 T1L = T1J * T1K;
Chris@42 270 T1I = FNMS(T1d, T1b, T1H);
Chris@42 271 Rm[0] = Tv + T1r;
Chris@42 272 Rp[0] = Tv - T1r;
Chris@42 273 T1V = T1M * T1K;
Chris@42 274 Im[0] = T1I - T1G;
Chris@42 275 Ip[0] = T1G + T1I;
Chris@42 276 T1P = W[16];
Chris@42 277 }
Chris@42 278 T1O = FNMS(T1M, T1N, T1L);
Chris@42 279 T1W = FMA(T1J, T1N, T1V);
Chris@42 280 T1S = W[17];
Chris@42 281 T1X = T1P * T1T;
Chris@42 282 T1R = T1P * T1Q;
Chris@42 283 }
Chris@42 284 {
Chris@42 285 E T2r, T2n, T2q, T2p, T2z, T2t, T2o, T1Y, T1U;
Chris@42 286 T27 = T25 + T26;
Chris@42 287 T2r = T26 - T25;
Chris@42 288 T2o = T20 - T21;
Chris@42 289 T22 = T20 + T21;
Chris@42 290 T1Y = FNMS(T1S, T1Q, T1X);
Chris@42 291 T1U = FMA(T1S, T1T, T1R);
Chris@42 292 T2n = W[22];
Chris@42 293 T2q = W[23];
Chris@42 294 Im[WS(rs, 4)] = T1Y - T1W;
Chris@42 295 Ip[WS(rs, 4)] = T1W + T1Y;
Chris@42 296 Rm[WS(rs, 4)] = T1O + T1U;
Chris@42 297 Rp[WS(rs, 4)] = T1O - T1U;
Chris@42 298 T2p = T2n * T2o;
Chris@42 299 T2z = T2q * T2o;
Chris@42 300 T2c = FMA(KP923879532, T2b, T2a);
Chris@42 301 T2u = FNMS(KP923879532, T2b, T2a);
Chris@42 302 T2x = FNMS(KP923879532, T2g, T2f);
Chris@42 303 T2h = FMA(KP923879532, T2g, T2f);
Chris@42 304 T2t = W[24];
Chris@42 305 T2s = FNMS(T2q, T2r, T2p);
Chris@42 306 T2A = FMA(T2n, T2r, T2z);
Chris@42 307 T2w = W[25];
Chris@42 308 T2B = T2t * T2x;
Chris@42 309 T2v = T2t * T2u;
Chris@42 310 }
Chris@42 311 }
Chris@42 312 }
Chris@42 313 {
Chris@42 314 E T28, T2k, T2e, T2l, T2d;
Chris@42 315 {
Chris@42 316 E T1Z, T24, T23, T2j, T29, T2C, T2y;
Chris@42 317 T2C = FNMS(T2w, T2u, T2B);
Chris@42 318 T2y = FMA(T2w, T2x, T2v);
Chris@42 319 T1Z = W[6];
Chris@42 320 T24 = W[7];
Chris@42 321 Im[WS(rs, 6)] = T2C - T2A;
Chris@42 322 Ip[WS(rs, 6)] = T2A + T2C;
Chris@42 323 Rm[WS(rs, 6)] = T2s + T2y;
Chris@42 324 Rp[WS(rs, 6)] = T2s - T2y;
Chris@42 325 T23 = T1Z * T22;
Chris@42 326 T2j = T24 * T22;
Chris@42 327 T29 = W[8];
Chris@42 328 T28 = FNMS(T24, T27, T23);
Chris@42 329 T2k = FMA(T1Z, T27, T2j);
Chris@42 330 T2e = W[9];
Chris@42 331 T2l = T29 * T2h;
Chris@42 332 T2d = T29 * T2c;
Chris@42 333 }
Chris@42 334 {
Chris@42 335 E T4a, T4d, T3O, T4h, T4b, T40, T48, T4g, T4c, T42, T3Y;
Chris@42 336 {
Chris@42 337 E T3N, T47, T43, T46, T3F, T45, T4f, T3K, T3J, T3S, T3X, T3Z, T49, T41, T3T;
Chris@42 338 E T3U;
Chris@42 339 {
Chris@42 340 E T44, T3I, T2m, T2i, T3P;
Chris@42 341 T44 = FNMS(KP707106781, T3H, T3G);
Chris@42 342 T3I = FMA(KP707106781, T3H, T3G);
Chris@42 343 T2m = FNMS(T2e, T2c, T2l);
Chris@42 344 T2i = FMA(T2e, T2h, T2d);
Chris@42 345 T3N = FMA(KP707106781, T3M, T3L);
Chris@42 346 T47 = FNMS(KP707106781, T3M, T3L);
Chris@42 347 Im[WS(rs, 2)] = T2m - T2k;
Chris@42 348 Ip[WS(rs, 2)] = T2k + T2m;
Chris@42 349 Rm[WS(rs, 2)] = T28 + T2i;
Chris@42 350 Rp[WS(rs, 2)] = T28 - T2i;
Chris@42 351 T43 = W[26];
Chris@42 352 T46 = W[27];
Chris@42 353 T3F = W[10];
Chris@42 354 T45 = T43 * T44;
Chris@42 355 T4f = T46 * T44;
Chris@42 356 T3K = W[11];
Chris@42 357 T3J = T3F * T3I;
Chris@42 358 T4a = FNMS(KP923879532, T3R, T3Q);
Chris@42 359 T3S = FMA(KP923879532, T3R, T3Q);
Chris@42 360 T3X = FNMS(KP923879532, T3W, T3V);
Chris@42 361 T4d = FMA(KP923879532, T3W, T3V);
Chris@42 362 T3Z = T3K * T3I;
Chris@42 363 T3P = W[12];
Chris@42 364 T49 = W[28];
Chris@42 365 T41 = T3P * T3X;
Chris@42 366 T3T = T3P * T3S;
Chris@42 367 }
Chris@42 368 T3O = FNMS(T3K, T3N, T3J);
Chris@42 369 T4h = T49 * T4d;
Chris@42 370 T4b = T49 * T4a;
Chris@42 371 T40 = FMA(T3F, T3N, T3Z);
Chris@42 372 T3U = W[13];
Chris@42 373 T48 = FNMS(T46, T47, T45);
Chris@42 374 T4g = FMA(T43, T47, T4f);
Chris@42 375 T4c = W[29];
Chris@42 376 T42 = FNMS(T3U, T3S, T41);
Chris@42 377 T3Y = FMA(T3U, T3X, T3T);
Chris@42 378 }
Chris@42 379 {
Chris@42 380 E T3t, T2X, T3p, T3s, T2D, T3r, T3B, T2Q, T2P, T3a, T3j, T3l, T3v, T3n, T3b;
Chris@42 381 E T3c;
Chris@42 382 {
Chris@42 383 E T2O, T3q, T4i, T4e, T2Z;
Chris@42 384 T4i = FNMS(T4c, T4a, T4h);
Chris@42 385 T4e = FMA(T4c, T4d, T4b);
Chris@42 386 Im[WS(rs, 3)] = T42 - T40;
Chris@42 387 Ip[WS(rs, 3)] = T40 + T42;
Chris@42 388 Rm[WS(rs, 3)] = T3O + T3Y;
Chris@42 389 Rp[WS(rs, 3)] = T3O - T3Y;
Chris@42 390 Im[WS(rs, 7)] = T4i - T4g;
Chris@42 391 Ip[WS(rs, 7)] = T4g + T4i;
Chris@42 392 Rm[WS(rs, 7)] = T48 + T4e;
Chris@42 393 Rp[WS(rs, 7)] = T48 - T4e;
Chris@42 394 T3t = FNMS(KP707106781, T2W, T2T);
Chris@42 395 T2X = FMA(KP707106781, T2W, T2T);
Chris@42 396 T2O = FMA(KP707106781, T2N, T2G);
Chris@42 397 T3q = FNMS(KP707106781, T2N, T2G);
Chris@42 398 T3p = W[18];
Chris@42 399 T3s = W[19];
Chris@42 400 T2D = W[2];
Chris@42 401 T3r = T3p * T3q;
Chris@42 402 T3B = T3s * T3q;
Chris@42 403 T2Q = W[3];
Chris@42 404 T2P = T2D * T2O;
Chris@42 405 T3a = FMA(KP923879532, T39, T32);
Chris@42 406 T3w = FNMS(KP923879532, T39, T32);
Chris@42 407 T3z = FMA(KP923879532, T3i, T3f);
Chris@42 408 T3j = FNMS(KP923879532, T3i, T3f);
Chris@42 409 T3l = T2Q * T2O;
Chris@42 410 T2Z = W[4];
Chris@42 411 T3v = W[20];
Chris@42 412 T3n = T2Z * T3j;
Chris@42 413 T3b = T2Z * T3a;
Chris@42 414 }
Chris@42 415 T2Y = FNMS(T2Q, T2X, T2P);
Chris@42 416 T3D = T3v * T3z;
Chris@42 417 T3x = T3v * T3w;
Chris@42 418 T3m = FMA(T2D, T2X, T3l);
Chris@42 419 T3c = W[5];
Chris@42 420 T3u = FNMS(T3s, T3t, T3r);
Chris@42 421 T3C = FMA(T3p, T3t, T3B);
Chris@42 422 T3y = W[21];
Chris@42 423 T3o = FNMS(T3c, T3a, T3n);
Chris@42 424 T3k = FMA(T3c, T3j, T3b);
Chris@42 425 }
Chris@42 426 }
Chris@42 427 }
Chris@42 428 }
Chris@42 429 }
/* Last stores of the iteration: the elements at WS(rs, 1) and WS(rs, 5). */
Chris@42 430 T3E = FNMS(T3y, T3w, T3D);
Chris@42 431 T3A = FMA(T3y, T3z, T3x);
Chris@42 432 Im[WS(rs, 1)] = T3o - T3m;
Chris@42 433 Ip[WS(rs, 1)] = T3m + T3o;
Chris@42 434 Rm[WS(rs, 1)] = T2Y + T3k;
Chris@42 435 Rp[WS(rs, 1)] = T2Y - T3k;
Chris@42 436 Im[WS(rs, 5)] = T3E - T3C;
Chris@42 437 Ip[WS(rs, 5)] = T3C + T3E;
Chris@42 438 Rm[WS(rs, 5)] = T3u + T3A;
Chris@42 439 Rp[WS(rs, 5)] = T3u - T3A;
Chris@42 440 }
Chris@42 441 }
Chris@42 442 }
Chris@42 443
/*
 * Twiddle-instruction table referenced by the codelet descriptor `desc`.
 * NOTE(review): presumably {TW_FULL, 1, 16} requests the full twiddle set
 * for size 16 and {TW_NEXT, 1, 0} terminates the list; the tw_instr
 * definition is not visible here -- confirm against the twiddle header.
 * The codelet itself reads W[0]..W[29] per iteration.
 */
Chris@42 444 static const tw_instr twinstr[] = {
Chris@42 445 {TW_FULL, 1, 16},
Chris@42 446 {TW_NEXT, 1, 0}
Chris@42 447 };
Chris@42 448
/*
 * Descriptor for the FMA variant: size 16, codelet name string, the twiddle
 * table above and &GENUS. The first three numbers of the last initializer
 * (136, 30, 70) equal the additions / multiplications / fused multiply-add
 * counts quoted in the generator comment for this variant.
 * NOTE(review): meaning of the trailing 0 is not visible here -- confirm
 * against the hc2c_desc / opcnt definition.
 */
Chris@42 449 static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, {136, 30, 70, 0} };
Chris@42 450
/*
 * Registration entry point (FMA build): hands the codelet function and its
 * descriptor to the planner `p` via X(khc2c_register), tagged HC2C_VIA_DFT.
 */
Chris@42 451 void X(codelet_hc2cbdft_16) (planner *p) {
Chris@42 452 X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
Chris@42 453 }
Chris@42 454 #else /* HAVE_FMA */
Chris@42 455
Chris@42 456 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hc2cbdft_16 -include hc2cb.h */
Chris@42 457
Chris@42 458 /*
Chris@42 459 * This function contains 206 FP additions, 84 FP multiplications,
Chris@42 460 * (or, 168 additions, 46 multiplications, 38 fused multiply/add),
Chris@42 461 * 60 stack variables, 3 constants, and 64 memory accesses
Chris@42 462 */
Chris@42 463 #include "hc2cb.h"
Chris@42 464
/*
 * hc2cbdft_16 (non-FMA variant) -- genfft-generated size-16 codelet.
 *
 * For every m in [mb, me) the loop body
 *   - loads 8 values from each of Rp, Ip, Rm and Im (offsets WS(rs, 0..7)),
 *   - combines them with additions/subtractions and the three constants
 *     declared below (about 0.9239, 0.3827 and 0.7071),
 *   - multiplies intermediate results by the 30 twiddle values W[0]..W[29],
 *   - stores 8 results back into each of the same four arrays.
 * All 32 loads of an iteration precede its first store, so the update is
 * done in place.
 *
 * Parameters:
 *   Rp, Ip  arrays read and written; advanced by +ms after each iteration
 *   Rm, Im  arrays read and written; advanced by -ms after each iteration
 *   W       twiddle table (read only); offset by (mb - 1) * 30 before the
 *           first iteration and advanced by 30 after each iteration
 *   rs      stride passed to WS() to address the 8 elements of each array
 *   mb, me  half-open range of the loop counter m
 *   ms      per-iteration pointer step for Rp/Ip/Rm/Im
 *
 * NOTE(review): DK, E, R, INT, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come
 * from headers not visible here; confirm the exact operand order of
 * FMA/FNMS in the codelet headers before relying on it.
 */
Chris@42 465 static void hc2cbdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 466 {
Chris@42 467 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 468 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 469 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 470 {
Chris@42 471 INT m;
Chris@42 472 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@42 473 E TB, T2L, T30, T1n, Tf, T1U, T2H, T3p, T1E, T1Z, TM, T31, T2s, T3k, T1i;
Chris@42 474 E T2M, Tu, T1Y, T2Q, T2X, T2T, T2Y, TY, T1d, T19, T1e, T2v, T2C, T2y, T2D;
Chris@42 475 E T1x, T1V;
/* First half of the inputs: elements 0, 4, 2, 6 of Rp/Ip paired with
   elements 7, 3, 5, 1 of Rm/Im; form sums and differences. */
Chris@42 476 {
Chris@42 477 E T3, T1j, TA, T1B, T6, Tx, T1m, T1C, Ta, TC, TF, T1y, Td, TH, TK;
Chris@42 478 E T1z;
Chris@42 479 {
Chris@42 480 E T1, T2, Ty, Tz;
Chris@42 481 T1 = Rp[0];
Chris@42 482 T2 = Rm[WS(rs, 7)];
Chris@42 483 T3 = T1 + T2;
Chris@42 484 T1j = T1 - T2;
Chris@42 485 Ty = Ip[0];
Chris@42 486 Tz = Im[WS(rs, 7)];
Chris@42 487 TA = Ty + Tz;
Chris@42 488 T1B = Ty - Tz;
Chris@42 489 }
Chris@42 490 {
Chris@42 491 E T4, T5, T1k, T1l;
Chris@42 492 T4 = Rp[WS(rs, 4)];
Chris@42 493 T5 = Rm[WS(rs, 3)];
Chris@42 494 T6 = T4 + T5;
Chris@42 495 Tx = T4 - T5;
Chris@42 496 T1k = Ip[WS(rs, 4)];
Chris@42 497 T1l = Im[WS(rs, 3)];
Chris@42 498 T1m = T1k + T1l;
Chris@42 499 T1C = T1k - T1l;
Chris@42 500 }
Chris@42 501 {
Chris@42 502 E T8, T9, TD, TE;
Chris@42 503 T8 = Rp[WS(rs, 2)];
Chris@42 504 T9 = Rm[WS(rs, 5)];
Chris@42 505 Ta = T8 + T9;
Chris@42 506 TC = T8 - T9;
Chris@42 507 TD = Ip[WS(rs, 2)];
Chris@42 508 TE = Im[WS(rs, 5)];
Chris@42 509 TF = TD + TE;
Chris@42 510 T1y = TD - TE;
Chris@42 511 }
Chris@42 512 {
Chris@42 513 E Tb, Tc, TI, TJ;
Chris@42 514 Tb = Rm[WS(rs, 1)];
Chris@42 515 Tc = Rp[WS(rs, 6)];
Chris@42 516 Td = Tb + Tc;
Chris@42 517 TH = Tb - Tc;
Chris@42 518 TI = Im[WS(rs, 1)];
Chris@42 519 TJ = Ip[WS(rs, 6)];
Chris@42 520 TK = TI + TJ;
Chris@42 521 T1z = TJ - TI;
Chris@42 522 }
Chris@42 523 {
Chris@42 524 E T7, Te, TG, TL;
Chris@42 525 TB = Tx + TA;
Chris@42 526 T2L = TA - Tx;
Chris@42 527 T30 = T1j + T1m;
Chris@42 528 T1n = T1j - T1m;
Chris@42 529 T7 = T3 + T6;
Chris@42 530 Te = Ta + Td;
Chris@42 531 Tf = T7 + Te;
Chris@42 532 T1U = T7 - Te;
Chris@42 533 {
Chris@42 534 E T2F, T2G, T1A, T1D;
Chris@42 535 T2F = Ta - Td;
Chris@42 536 T2G = T1B - T1C;
Chris@42 537 T2H = T2F + T2G;
Chris@42 538 T3p = T2G - T2F;
Chris@42 539 T1A = T1y + T1z;
Chris@42 540 T1D = T1B + T1C;
Chris@42 541 T1E = T1A + T1D;
Chris@42 542 T1Z = T1D - T1A;
Chris@42 543 }
Chris@42 544 TG = TC + TF;
Chris@42 545 TL = TH + TK;
Chris@42 546 TM = KP707106781 * (TG - TL);
Chris@42 547 T31 = KP707106781 * (TG + TL);
Chris@42 548 {
Chris@42 549 E T2q, T2r, T1g, T1h;
Chris@42 550 T2q = T3 - T6;
Chris@42 551 T2r = T1z - T1y;
Chris@42 552 T2s = T2q + T2r;
Chris@42 553 T3k = T2q - T2r;
Chris@42 554 T1g = TC - TF;
Chris@42 555 T1h = TH - TK;
Chris@42 556 T1i = KP707106781 * (T1g + T1h);
Chris@42 557 T2M = KP707106781 * (T1g - T1h);
Chris@42 558 }
Chris@42 559 }
Chris@42 560 }
/* Second half of the inputs: elements 1, 5, 7, 3 of Rp/Ip paired with
   elements 6, 2, 0, 4 of Rm/Im; sums, differences and rotations by the
   0.9239 / 0.3827 constants. */
Chris@42 561 {
Chris@42 562 E Ti, TT, TR, T1r, Tl, TO, TW, T1s, Tp, T14, T12, T1u, Ts, TZ, T17;
Chris@42 563 E T1v;
Chris@42 564 {
Chris@42 565 E Tg, Th, TP, TQ;
Chris@42 566 Tg = Rp[WS(rs, 1)];
Chris@42 567 Th = Rm[WS(rs, 6)];
Chris@42 568 Ti = Tg + Th;
Chris@42 569 TT = Tg - Th;
Chris@42 570 TP = Ip[WS(rs, 1)];
Chris@42 571 TQ = Im[WS(rs, 6)];
Chris@42 572 TR = TP + TQ;
Chris@42 573 T1r = TP - TQ;
Chris@42 574 }
Chris@42 575 {
Chris@42 576 E Tj, Tk, TU, TV;
Chris@42 577 Tj = Rp[WS(rs, 5)];
Chris@42 578 Tk = Rm[WS(rs, 2)];
Chris@42 579 Tl = Tj + Tk;
Chris@42 580 TO = Tj - Tk;
Chris@42 581 TU = Ip[WS(rs, 5)];
Chris@42 582 TV = Im[WS(rs, 2)];
Chris@42 583 TW = TU + TV;
Chris@42 584 T1s = TU - TV;
Chris@42 585 }
Chris@42 586 {
Chris@42 587 E Tn, To, T10, T11;
Chris@42 588 Tn = Rm[0];
Chris@42 589 To = Rp[WS(rs, 7)];
Chris@42 590 Tp = Tn + To;
Chris@42 591 T14 = Tn - To;
Chris@42 592 T10 = Im[0];
Chris@42 593 T11 = Ip[WS(rs, 7)];
Chris@42 594 T12 = T10 + T11;
Chris@42 595 T1u = T11 - T10;
Chris@42 596 }
Chris@42 597 {
Chris@42 598 E Tq, Tr, T15, T16;
Chris@42 599 Tq = Rp[WS(rs, 3)];
Chris@42 600 Tr = Rm[WS(rs, 4)];
Chris@42 601 Ts = Tq + Tr;
Chris@42 602 TZ = Tq - Tr;
Chris@42 603 T15 = Ip[WS(rs, 3)];
Chris@42 604 T16 = Im[WS(rs, 4)];
Chris@42 605 T17 = T15 + T16;
Chris@42 606 T1v = T15 - T16;
Chris@42 607 }
Chris@42 608 {
Chris@42 609 E Tm, Tt, T2O, T2P;
Chris@42 610 Tm = Ti + Tl;
Chris@42 611 Tt = Tp + Ts;
Chris@42 612 Tu = Tm + Tt;
Chris@42 613 T1Y = Tm - Tt;
Chris@42 614 T2O = TR - TO;
Chris@42 615 T2P = TT + TW;
Chris@42 616 T2Q = FMA(KP382683432, T2O, KP923879532 * T2P);
Chris@42 617 T2X = FNMS(KP923879532, T2O, KP382683432 * T2P);
Chris@42 618 }
Chris@42 619 {
Chris@42 620 E T2R, T2S, TS, TX;
Chris@42 621 T2R = TZ + T12;
Chris@42 622 T2S = T14 + T17;
Chris@42 623 T2T = FMA(KP382683432, T2R, KP923879532 * T2S);
Chris@42 624 T2Y = FNMS(KP923879532, T2R, KP382683432 * T2S);
Chris@42 625 TS = TO + TR;
Chris@42 626 TX = TT - TW;
Chris@42 627 TY = FMA(KP923879532, TS, KP382683432 * TX);
Chris@42 628 T1d = FNMS(KP382683432, TS, KP923879532 * TX);
Chris@42 629 }
Chris@42 630 {
Chris@42 631 E T13, T18, T2t, T2u;
Chris@42 632 T13 = TZ - T12;
Chris@42 633 T18 = T14 - T17;
Chris@42 634 T19 = FNMS(KP382683432, T18, KP923879532 * T13);
Chris@42 635 T1e = FMA(KP382683432, T13, KP923879532 * T18);
Chris@42 636 T2t = Ti - Tl;
Chris@42 637 T2u = T1r - T1s;
Chris@42 638 T2v = T2t - T2u;
Chris@42 639 T2C = T2t + T2u;
Chris@42 640 }
Chris@42 641 {
Chris@42 642 E T2w, T2x, T1t, T1w;
Chris@42 643 T2w = Tp - Ts;
Chris@42 644 T2x = T1u - T1v;
Chris@42 645 T2y = T2w + T2x;
Chris@42 646 T2D = T2x - T2w;
Chris@42 647 T1t = T1r + T1s;
Chris@42 648 T1w = T1u + T1v;
Chris@42 649 T1x = T1t + T1w;
Chris@42 650 T1V = T1w - T1t;
Chris@42 651 }
Chris@42 652 }
/* Outputs at element 0 (twiddles W[0], W[1]) and element 4 (W[14]..W[17]). */
Chris@42 653 {
Chris@42 654 E Tv, T1F, T1b, T1N, T1p, T1P, T1L, T1R;
Chris@42 655 Tv = Tf + Tu;
Chris@42 656 T1F = T1x + T1E;
Chris@42 657 {
Chris@42 658 E TN, T1a, T1f, T1o;
Chris@42 659 TN = TB + TM;
Chris@42 660 T1a = TY + T19;
Chris@42 661 T1b = TN + T1a;
Chris@42 662 T1N = TN - T1a;
Chris@42 663 T1f = T1d + T1e;
Chris@42 664 T1o = T1i + T1n;
Chris@42 665 T1p = T1f + T1o;
Chris@42 666 T1P = T1o - T1f;
Chris@42 667 {
Chris@42 668 E T1I, T1K, T1H, T1J;
Chris@42 669 T1I = Tf - Tu;
Chris@42 670 T1K = T1E - T1x;
Chris@42 671 T1H = W[14];
Chris@42 672 T1J = W[15];
Chris@42 673 T1L = FNMS(T1J, T1K, T1H * T1I);
Chris@42 674 T1R = FMA(T1J, T1I, T1H * T1K);
Chris@42 675 }
Chris@42 676 }
Chris@42 677 {
Chris@42 678 E T1q, T1G, Tw, T1c;
Chris@42 679 Tw = W[0];
Chris@42 680 T1c = W[1];
Chris@42 681 T1q = FMA(Tw, T1b, T1c * T1p);
Chris@42 682 T1G = FNMS(T1c, T1b, Tw * T1p);
Chris@42 683 Rp[0] = Tv - T1q;
Chris@42 684 Ip[0] = T1F + T1G;
Chris@42 685 Rm[0] = Tv + T1q;
Chris@42 686 Im[0] = T1G - T1F;
Chris@42 687 }
Chris@42 688 {
Chris@42 689 E T1Q, T1S, T1M, T1O;
Chris@42 690 T1M = W[16];
Chris@42 691 T1O = W[17];
Chris@42 692 T1Q = FMA(T1M, T1N, T1O * T1P);
Chris@42 693 T1S = FNMS(T1O, T1N, T1M * T1P);
Chris@42 694 Rp[WS(rs, 4)] = T1L - T1Q;
Chris@42 695 Ip[WS(rs, 4)] = T1R + T1S;
Chris@42 696 Rm[WS(rs, 4)] = T1L + T1Q;
Chris@42 697 Im[WS(rs, 4)] = T1S - T1R;
Chris@42 698 }
Chris@42 699 }
/* Outputs at element 2 (twiddles W[6]..W[9]) and element 6 (W[22]..W[25]). */
Chris@42 700 {
Chris@42 701 E T25, T2j, T29, T2l, T21, T2b, T2h, T2n;
Chris@42 702 {
Chris@42 703 E T23, T24, T27, T28;
Chris@42 704 T23 = TB - TM;
Chris@42 705 T24 = T1d - T1e;
Chris@42 706 T25 = T23 + T24;
Chris@42 707 T2j = T23 - T24;
Chris@42 708 T27 = T19 - TY;
Chris@42 709 T28 = T1n - T1i;
Chris@42 710 T29 = T27 + T28;
Chris@42 711 T2l = T28 - T27;
Chris@42 712 }
Chris@42 713 {
Chris@42 714 E T1W, T20, T1T, T1X;
Chris@42 715 T1W = T1U + T1V;
Chris@42 716 T20 = T1Y + T1Z;
Chris@42 717 T1T = W[6];
Chris@42 718 T1X = W[7];
Chris@42 719 T21 = FNMS(T1X, T20, T1T * T1W);
Chris@42 720 T2b = FMA(T1X, T1W, T1T * T20);
Chris@42 721 }
Chris@42 722 {
Chris@42 723 E T2e, T2g, T2d, T2f;
Chris@42 724 T2e = T1U - T1V;
Chris@42 725 T2g = T1Z - T1Y;
Chris@42 726 T2d = W[22];
Chris@42 727 T2f = W[23];
Chris@42 728 T2h = FNMS(T2f, T2g, T2d * T2e);
Chris@42 729 T2n = FMA(T2f, T2e, T2d * T2g);
Chris@42 730 }
Chris@42 731 {
Chris@42 732 E T2a, T2c, T22, T26;
Chris@42 733 T22 = W[8];
Chris@42 734 T26 = W[9];
Chris@42 735 T2a = FMA(T22, T25, T26 * T29);
Chris@42 736 T2c = FNMS(T26, T25, T22 * T29);
Chris@42 737 Rp[WS(rs, 2)] = T21 - T2a;
Chris@42 738 Ip[WS(rs, 2)] = T2b + T2c;
Chris@42 739 Rm[WS(rs, 2)] = T21 + T2a;
Chris@42 740 Im[WS(rs, 2)] = T2c - T2b;
Chris@42 741 }
Chris@42 742 {
Chris@42 743 E T2m, T2o, T2i, T2k;
Chris@42 744 T2i = W[24];
Chris@42 745 T2k = W[25];
Chris@42 746 T2m = FMA(T2i, T2j, T2k * T2l);
Chris@42 747 T2o = FNMS(T2k, T2j, T2i * T2l);
Chris@42 748 Rp[WS(rs, 6)] = T2h - T2m;
Chris@42 749 Ip[WS(rs, 6)] = T2n + T2o;
Chris@42 750 Rm[WS(rs, 6)] = T2h + T2m;
Chris@42 751 Im[WS(rs, 6)] = T2o - T2n;
Chris@42 752 }
Chris@42 753 }
/* Outputs at element 1 (twiddles W[2]..W[5]) and element 5 (W[18]..W[21]). */
Chris@42 754 {
Chris@42 755 E T2A, T38, T2I, T3a, T2V, T3d, T33, T3f, T2z, T2E;
Chris@42 756 T2z = KP707106781 * (T2v + T2y);
Chris@42 757 T2A = T2s + T2z;
Chris@42 758 T38 = T2s - T2z;
Chris@42 759 T2E = KP707106781 * (T2C + T2D);
Chris@42 760 T2I = T2E + T2H;
Chris@42 761 T3a = T2H - T2E;
Chris@42 762 {
Chris@42 763 E T2N, T2U, T2Z, T32;
Chris@42 764 T2N = T2L + T2M;
Chris@42 765 T2U = T2Q - T2T;
Chris@42 766 T2V = T2N + T2U;
Chris@42 767 T3d = T2N - T2U;
Chris@42 768 T2Z = T2X + T2Y;
Chris@42 769 T32 = T30 - T31;
Chris@42 770 T33 = T2Z + T32;
Chris@42 771 T3f = T32 - T2Z;
Chris@42 772 }
Chris@42 773 {
Chris@42 774 E T2J, T35, T34, T36;
Chris@42 775 {
Chris@42 776 E T2p, T2B, T2K, T2W;
Chris@42 777 T2p = W[2];
Chris@42 778 T2B = W[3];
Chris@42 779 T2J = FNMS(T2B, T2I, T2p * T2A);
Chris@42 780 T35 = FMA(T2B, T2A, T2p * T2I);
Chris@42 781 T2K = W[4];
Chris@42 782 T2W = W[5];
Chris@42 783 T34 = FMA(T2K, T2V, T2W * T33);
Chris@42 784 T36 = FNMS(T2W, T2V, T2K * T33);
Chris@42 785 }
Chris@42 786 Rp[WS(rs, 1)] = T2J - T34;
Chris@42 787 Ip[WS(rs, 1)] = T35 + T36;
Chris@42 788 Rm[WS(rs, 1)] = T2J + T34;
Chris@42 789 Im[WS(rs, 1)] = T36 - T35;
Chris@42 790 }
Chris@42 791 {
Chris@42 792 E T3b, T3h, T3g, T3i;
Chris@42 793 {
Chris@42 794 E T37, T39, T3c, T3e;
Chris@42 795 T37 = W[18];
Chris@42 796 T39 = W[19];
Chris@42 797 T3b = FNMS(T39, T3a, T37 * T38);
Chris@42 798 T3h = FMA(T39, T38, T37 * T3a);
Chris@42 799 T3c = W[20];
Chris@42 800 T3e = W[21];
Chris@42 801 T3g = FMA(T3c, T3d, T3e * T3f);
Chris@42 802 T3i = FNMS(T3e, T3d, T3c * T3f);
Chris@42 803 }
Chris@42 804 Rp[WS(rs, 5)] = T3b - T3g;
Chris@42 805 Ip[WS(rs, 5)] = T3h + T3i;
Chris@42 806 Rm[WS(rs, 5)] = T3b + T3g;
Chris@42 807 Im[WS(rs, 5)] = T3i - T3h;
Chris@42 808 }
Chris@42 809 }
/* Outputs at element 3 (twiddles W[10]..W[13]) and element 7 (W[26]..W[29]). */
Chris@42 810 {
Chris@42 811 E T3m, T3E, T3q, T3G, T3v, T3J, T3z, T3L, T3l, T3o;
Chris@42 812 T3l = KP707106781 * (T2D - T2C);
Chris@42 813 T3m = T3k + T3l;
Chris@42 814 T3E = T3k - T3l;
Chris@42 815 T3o = KP707106781 * (T2v - T2y);
Chris@42 816 T3q = T3o + T3p;
Chris@42 817 T3G = T3p - T3o;
Chris@42 818 {
Chris@42 819 E T3t, T3u, T3x, T3y;
Chris@42 820 T3t = T2L - T2M;
Chris@42 821 T3u = T2X - T2Y;
Chris@42 822 T3v = T3t + T3u;
Chris@42 823 T3J = T3t - T3u;
Chris@42 824 T3x = T31 + T30;
Chris@42 825 T3y = T2Q + T2T;
Chris@42 826 T3z = T3x - T3y;
Chris@42 827 T3L = T3y + T3x;
Chris@42 828 }
Chris@42 829 {
Chris@42 830 E T3r, T3B, T3A, T3C;
Chris@42 831 {
Chris@42 832 E T3j, T3n, T3s, T3w;
Chris@42 833 T3j = W[10];
Chris@42 834 T3n = W[11];
Chris@42 835 T3r = FNMS(T3n, T3q, T3j * T3m);
Chris@42 836 T3B = FMA(T3n, T3m, T3j * T3q);
Chris@42 837 T3s = W[12];
Chris@42 838 T3w = W[13];
Chris@42 839 T3A = FMA(T3s, T3v, T3w * T3z);
Chris@42 840 T3C = FNMS(T3w, T3v, T3s * T3z);
Chris@42 841 }
Chris@42 842 Rp[WS(rs, 3)] = T3r - T3A;
Chris@42 843 Ip[WS(rs, 3)] = T3B + T3C;
Chris@42 844 Rm[WS(rs, 3)] = T3r + T3A;
Chris@42 845 Im[WS(rs, 3)] = T3C - T3B;
Chris@42 846 }
Chris@42 847 {
Chris@42 848 E T3H, T3N, T3M, T3O;
Chris@42 849 {
Chris@42 850 E T3D, T3F, T3I, T3K;
Chris@42 851 T3D = W[26];
Chris@42 852 T3F = W[27];
Chris@42 853 T3H = FNMS(T3F, T3G, T3D * T3E);
Chris@42 854 T3N = FMA(T3F, T3E, T3D * T3G);
Chris@42 855 T3I = W[28];
Chris@42 856 T3K = W[29];
Chris@42 857 T3M = FMA(T3I, T3J, T3K * T3L);
Chris@42 858 T3O = FNMS(T3K, T3J, T3I * T3L);
Chris@42 859 }
Chris@42 860 Rp[WS(rs, 7)] = T3H - T3M;
Chris@42 861 Ip[WS(rs, 7)] = T3N + T3O;
Chris@42 862 Rm[WS(rs, 7)] = T3H + T3M;
Chris@42 863 Im[WS(rs, 7)] = T3O - T3N;
Chris@42 864 }
Chris@42 865 }
Chris@42 866 }
Chris@42 867 }
Chris@42 868 }
Chris@42 869
/*
 * Twiddle-instruction table referenced by the codelet descriptor `desc`
 * (non-FMA build).
 * NOTE(review): presumably {TW_FULL, 1, 16} requests the full twiddle set
 * for size 16 and {TW_NEXT, 1, 0} terminates the list; the tw_instr
 * definition is not visible here -- confirm against the twiddle header.
 * The codelet itself reads W[0]..W[29] per iteration.
 */
Chris@42 870 static const tw_instr twinstr[] = {
Chris@42 871 {TW_FULL, 1, 16},
Chris@42 872 {TW_NEXT, 1, 0}
Chris@42 873 };
Chris@42 874
/*
 * Descriptor for the non-FMA variant: size 16, codelet name string, the
 * twiddle table above and &GENUS. The first three numbers of the last
 * initializer (168, 46, 38) equal the additions / multiplications / fused
 * multiply-add counts quoted in the generator comment for this variant.
 * NOTE(review): meaning of the trailing 0 is not visible here -- confirm
 * against the hc2c_desc / opcnt definition.
 */
Chris@42 875 static const hc2c_desc desc = { 16, "hc2cbdft_16", twinstr, &GENUS, {168, 46, 38, 0} };
Chris@42 876
/*
 * Registration entry point (non-FMA build): hands the codelet function and
 * its descriptor to the planner `p` via X(khc2c_register), tagged
 * HC2C_VIA_DFT.
 */
Chris@42 877 void X(codelet_hc2cbdft_16) (planner *p) {
Chris@42 878 X(khc2c_register) (p, hc2cbdft_16, &desc, HC2C_VIA_DFT);
Chris@42 879 }
Chris@42 880 #endif /* HAVE_FMA */