annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cb2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:51:39 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hc2cb2_16 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 196 FP additions, 134 FP multiplications,
Chris@42 32 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
Chris@42 33 * 112 stack variables, 3 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cb2_16, variant compiled when HAVE_FMA is defined.
 *
 * For each m in [mb, me) the loop body:
 *   - reads the eight twiddle values W[0..7] and forms further factors as
 *     products/combinations of them;
 *   - loads eight values from each of Rp, Ip, Rm and Im (index k scaled by
 *     the stride rs through WS(rs, k), k = 0..7);
 *   - stores 32 results back to those same locations. All loads of an
 *     iteration are issued before its first store.
 * Rp[0] and Rm[0] are stored without a twiddle multiply. Ip/Im at indices
 * 0, 1, 4 and 7 are multiplied by the stored pairs (W[0],W[1]), (W[2],W[3]),
 * (W[4],W[5]) and (W[6],W[7]) respectively; every other output uses a
 * derived factor.
 * Between iterations Rp and Ip advance by +ms, Rm and Im by -ms, W by 8.
 *
 * This is generator output (see the "Generated by" line earlier in this
 * file: -n 16 -dif -sign 1 -twiddle-log3); do not hand-edit the arithmetic.
 */
Chris@42 37 static void hc2cb2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numerically: cos(pi/8), sqrt(1/2), and tan(pi/8) = sqrt(2) - 1. */
Chris@42 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 40 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 41 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 42 {
Chris@42 43 INT m;
/* W is pre-offset by (mb - 1) * 8: eight twiddle values are consumed per m. */
Chris@42 44 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@42 45 E Tv, TB, TF, Ty, T1J, T1O, T1N, T1K;
Chris@42 46 {
Chris@42 47 E Tw, T2z, T2C, Tx, T3f, T3l, T2F, T3r, Tz;
/* Stored twiddles: Tv=W[0], Ty=W[1], Tw=W[2], Tz=W[3], TB=W[4], TF=W[5], T2z=W[6], T2C=W[7]. */
Chris@42 48 Tv = W[0];
Chris@42 49 Tw = W[2];
Chris@42 50 T2z = W[6];
Chris@42 51 T2C = W[7];
Chris@42 52 TB = W[4];
Chris@42 53 Tx = Tv * Tw;
Chris@42 54 T3f = Tv * T2z;
Chris@42 55 T3l = Tv * T2C;
Chris@42 56 T2F = Tv * TB;
Chris@42 57 T3r = Tw * TB;
Chris@42 58 TF = W[5];
Chris@42 59 Ty = W[1];
Chris@42 60 Tz = W[3];
Chris@42 61 {
Chris@42 62 E T2G, T3z, T3m, T3g, T3L, T3s, T1V, TA, T3w, T3Q, T30, T3C, TE, T1X, T1D;
Chris@42 63 E TG, T1G, T1o, T2p, T1Y, T2u, T2c, T1Z, TL, T1t, T2d, T3n, T35, T3R, T3F;
Chris@42 64 E T1w, T20, T3M, Tf, T3h, T2L, T2e, TW, T2Q, T36, T3I, T3N, T2V, T37, T1d;
Chris@42 65 E Tu, T3S, T18, T1z, T1i, T24, T2g, T27, T2h;
Chris@42 66 {
Chris@42 67 E T2K, TQ, TV, T2H;
Chris@42 68 {
Chris@42 69 E TH, T3, T32, T1s, T1p, T6, T33, TK, TM, Ta, TS, T2J, TP, TR, Td;
Chris@42 70 E TT, TI, TJ;
Chris@42 71 {
Chris@42 72 E T1q, T1r, T4, T5;
Chris@42 73 {
Chris@42 74 E T1, T1n, TC, T2b, T1W, T2, T3v, T2Z, TD;
/* Derived twiddle factors are interleaved with the first input loads. */
Chris@42 75 T1 = Rp[0];
Chris@42 76 T3v = Tw * TF;
Chris@42 77 T2Z = Tv * TF;
Chris@42 78 T2G = FNMS(Ty, TF, T2F);
Chris@42 79 T3z = FMA(Ty, TF, T2F);
Chris@42 80 T3m = FNMS(Ty, T2z, T3l);
Chris@42 81 T3g = FMA(Ty, T2C, T3f);
Chris@42 82 T3L = FNMS(Tz, TF, T3r);
Chris@42 83 T3s = FMA(Tz, TF, T3r);
Chris@42 84 T1V = FMA(Ty, Tz, Tx);
Chris@42 85 TA = FNMS(Ty, Tz, Tx);
Chris@42 86 TD = Tv * Tz;
Chris@42 87 T3w = FNMS(Tz, TB, T3v);
Chris@42 88 T3Q = FMA(Tz, TB, T3v);
Chris@42 89 T30 = FMA(Ty, TB, T2Z);
Chris@42 90 T3C = FNMS(Ty, TB, T2Z);
Chris@42 91 T1n = TA * TF;
Chris@42 92 TC = TA * TB;
Chris@42 93 T2b = T1V * TF;
Chris@42 94 T1W = T1V * TB;
Chris@42 95 TE = FMA(Ty, Tw, TD);
Chris@42 96 T1X = FNMS(Ty, Tw, TD);
Chris@42 97 T2 = Rm[WS(rs, 7)];
Chris@42 98 T1q = Ip[0];
Chris@42 99 T1D = FMA(TE, TF, TC);
Chris@42 100 TG = FNMS(TE, TF, TC);
Chris@42 101 T1G = FNMS(TE, TB, T1n);
Chris@42 102 T1o = FMA(TE, TB, T1n);
Chris@42 103 T2p = FMA(T1X, TF, T1W);
Chris@42 104 T1Y = FNMS(T1X, TF, T1W);
Chris@42 105 T2u = FNMS(T1X, TB, T2b);
Chris@42 106 T2c = FMA(T1X, TB, T2b);
Chris@42 107 TH = T1 - T2;
Chris@42 108 T3 = T1 + T2;
Chris@42 109 T1r = Im[WS(rs, 7)];
Chris@42 110 }
Chris@42 111 T4 = Rp[WS(rs, 4)];
Chris@42 112 T5 = Rm[WS(rs, 3)];
Chris@42 113 TI = Ip[WS(rs, 4)];
Chris@42 114 T32 = T1q - T1r;
Chris@42 115 T1s = T1q + T1r;
Chris@42 116 T1p = T4 - T5;
Chris@42 117 T6 = T4 + T5;
Chris@42 118 TJ = Im[WS(rs, 3)];
Chris@42 119 }
Chris@42 120 {
Chris@42 121 E TN, TO, T8, T9, Tb, Tc;
Chris@42 122 T8 = Rp[WS(rs, 2)];
Chris@42 123 T9 = Rm[WS(rs, 5)];
Chris@42 124 TN = Ip[WS(rs, 2)];
Chris@42 125 T33 = TI - TJ;
Chris@42 126 TK = TI + TJ;
Chris@42 127 TM = T8 - T9;
Chris@42 128 Ta = T8 + T9;
Chris@42 129 TO = Im[WS(rs, 5)];
Chris@42 130 Tb = Rm[WS(rs, 1)];
Chris@42 131 Tc = Rp[WS(rs, 6)];
Chris@42 132 TS = Ip[WS(rs, 6)];
Chris@42 133 T2J = TN - TO;
Chris@42 134 TP = TN + TO;
Chris@42 135 TR = Tb - Tc;
Chris@42 136 Td = Tb + Tc;
Chris@42 137 TT = Im[WS(rs, 1)];
Chris@42 138 }
Chris@42 139 {
Chris@42 140 E T2I, TU, Te, T31, T34, T3D;
Chris@42 141 T1Z = TH + TK;
Chris@42 142 TL = TH - TK;
Chris@42 143 T1t = T1p + T1s;
Chris@42 144 T2d = T1s - T1p;
Chris@42 145 T2I = TS - TT;
Chris@42 146 TU = TS + TT;
Chris@42 147 Te = Ta + Td;
Chris@42 148 T31 = Ta - Td;
Chris@42 149 T34 = T32 - T33;
Chris@42 150 T3D = T32 + T33;
Chris@42 151 {
Chris@42 152 E T1u, T1v, T3E, T7;
Chris@42 153 T3E = T2J + T2I;
Chris@42 154 T2K = T2I - T2J;
Chris@42 155 TQ = TM - TP;
Chris@42 156 T1u = TM + TP;
Chris@42 157 T3n = T34 - T31;
Chris@42 158 T35 = T31 + T34;
Chris@42 159 TV = TR - TU;
Chris@42 160 T1v = TR + TU;
Chris@42 161 T3R = T3D - T3E;
Chris@42 162 T3F = T3D + T3E;
Chris@42 163 T2H = T3 - T6;
Chris@42 164 T7 = T3 + T6;
Chris@42 165 T1w = T1u - T1v;
Chris@42 166 T20 = T1u + T1v;
Chris@42 167 T3M = T7 - Te;
Chris@42 168 Tf = T7 + Te;
Chris@42 169 }
Chris@42 170 }
Chris@42 171 }
Chris@42 172 {
Chris@42 173 E T1e, Ti, T2N, T1c, T19, Tl, T2O, T1h, Tq, T13, Tp, T2S, T11, Tr, T14;
Chris@42 174 E T15;
Chris@42 175 {
Chris@42 176 E Tj, Tk, T1f, T1g;
Chris@42 177 {
Chris@42 178 E Tg, Th, T1a, T1b;
Chris@42 179 Tg = Rp[WS(rs, 1)];
Chris@42 180 T3h = T2H - T2K;
Chris@42 181 T2L = T2H + T2K;
Chris@42 182 T2e = TQ - TV;
Chris@42 183 TW = TQ + TV;
Chris@42 184 Th = Rm[WS(rs, 6)];
Chris@42 185 T1a = Ip[WS(rs, 1)];
Chris@42 186 T1b = Im[WS(rs, 6)];
Chris@42 187 Tj = Rp[WS(rs, 5)];
Chris@42 188 T1e = Tg - Th;
Chris@42 189 Ti = Tg + Th;
Chris@42 190 T2N = T1a - T1b;
Chris@42 191 T1c = T1a + T1b;
Chris@42 192 Tk = Rm[WS(rs, 2)];
Chris@42 193 T1f = Ip[WS(rs, 5)];
Chris@42 194 T1g = Im[WS(rs, 2)];
Chris@42 195 }
Chris@42 196 {
Chris@42 197 E Tn, To, TZ, T10;
Chris@42 198 Tn = Rm[0];
Chris@42 199 T19 = Tj - Tk;
Chris@42 200 Tl = Tj + Tk;
Chris@42 201 T2O = T1f - T1g;
Chris@42 202 T1h = T1f + T1g;
Chris@42 203 To = Rp[WS(rs, 7)];
Chris@42 204 TZ = Ip[WS(rs, 7)];
Chris@42 205 T10 = Im[0];
Chris@42 206 Tq = Rp[WS(rs, 3)];
Chris@42 207 T13 = Tn - To;
Chris@42 208 Tp = Tn + To;
Chris@42 209 T2S = TZ - T10;
Chris@42 210 T11 = TZ + T10;
Chris@42 211 Tr = Rm[WS(rs, 4)];
Chris@42 212 T14 = Ip[WS(rs, 3)];
Chris@42 213 T15 = Im[WS(rs, 4)];
Chris@42 214 }
Chris@42 215 }
Chris@42 216 {
Chris@42 217 E TY, T16, Tm, Tt;
Chris@42 218 {
Chris@42 219 E T2P, T3G, Ts, T2M, T3H, T2U, T2T, T2R;
Chris@42 220 T2P = T2N - T2O;
Chris@42 221 T3G = T2N + T2O;
Chris@42 222 TY = Tq - Tr;
Chris@42 223 Ts = Tq + Tr;
Chris@42 224 T2T = T14 - T15;
Chris@42 225 T16 = T14 + T15;
Chris@42 226 T2M = Ti - Tl;
Chris@42 227 Tm = Ti + Tl;
Chris@42 228 T3H = T2S + T2T;
Chris@42 229 T2U = T2S - T2T;
Chris@42 230 Tt = Tp + Ts;
Chris@42 231 T2R = Tp - Ts;
Chris@42 232 T2Q = T2M - T2P;
Chris@42 233 T36 = T2M + T2P;
Chris@42 234 T3I = T3G + T3H;
Chris@42 235 T3N = T3H - T3G;
Chris@42 236 T2V = T2R + T2U;
Chris@42 237 T37 = T2U - T2R;
Chris@42 238 }
Chris@42 239 {
Chris@42 240 E T25, T26, T22, T23, T12, T17;
Chris@42 241 T12 = TY - T11;
Chris@42 242 T25 = TY + T11;
Chris@42 243 T26 = T13 + T16;
Chris@42 244 T17 = T13 - T16;
Chris@42 245 T22 = T1c - T19;
Chris@42 246 T1d = T19 + T1c;
Chris@42 247 Tu = Tm + Tt;
Chris@42 248 T3S = Tm - Tt;
Chris@42 249 T18 = FNMS(KP414213562, T17, T12);
Chris@42 250 T1z = FMA(KP414213562, T12, T17);
Chris@42 251 T1i = T1e - T1h;
Chris@42 252 T23 = T1e + T1h;
Chris@42 253 T24 = FNMS(KP414213562, T23, T22);
Chris@42 254 T2g = FMA(KP414213562, T22, T23);
Chris@42 255 T27 = FNMS(KP414213562, T26, T25);
Chris@42 256 T2h = FMA(KP414213562, T25, T26);
Chris@42 257 }
Chris@42 258 }
Chris@42 259 }
Chris@42 260 }
Chris@42 261 {
Chris@42 262 E T1j, T1y, T3V, T3X, T3W, T38, T3i, T3o, T2W, T3K, T3B, T3A;
/* First store of the iteration; every input load has already been issued. */
Chris@42 263 Rp[0] = Tf + Tu;
Chris@42 264 T3A = Tf - Tu;
Chris@42 265 T1j = FMA(KP414213562, T1i, T1d);
Chris@42 266 T1y = FNMS(KP414213562, T1d, T1i);
Chris@42 267 T3K = T3C * T3A;
Chris@42 268 T3B = T3z * T3A;
Chris@42 269 {
Chris@42 270 E T3O, T3T, T3J, T3P, T3U;
Chris@42 271 T3O = T3M - T3N;
Chris@42 272 T3V = T3M + T3N;
Chris@42 273 T3X = T3S + T3R;
Chris@42 274 T3T = T3R - T3S;
Chris@42 275 Rm[0] = T3F + T3I;
Chris@42 276 T3J = T3F - T3I;
Chris@42 277 T3P = T3L * T3O;
Chris@42 278 T3U = T3L * T3T;
Chris@42 279 T3W = TA * T3V;
Chris@42 280 Rp[WS(rs, 4)] = FNMS(T3C, T3J, T3B);
Chris@42 281 Rm[WS(rs, 4)] = FMA(T3z, T3J, T3K);
Chris@42 282 Rp[WS(rs, 6)] = FNMS(T3Q, T3T, T3P);
Chris@42 283 Rm[WS(rs, 6)] = FMA(T3Q, T3O, T3U);
Chris@42 284 T38 = T36 + T37;
Chris@42 285 T3i = T37 - T36;
Chris@42 286 T3o = T2Q - T2V;
Chris@42 287 T2W = T2Q + T2V;
Chris@42 288 }
Chris@42 289 {
Chris@42 290 E T2q, T21, T28, T2w, T2v, T2f, T2i, T2r;
Chris@42 291 {
Chris@42 292 E T2Y, T3a, T3c, T3d, T39, T3e, T3b, T2X, T3Y;
Chris@42 293 Rp[WS(rs, 2)] = FNMS(TE, T3X, T3W);
Chris@42 294 T3Y = TA * T3X;
Chris@42 295 {
Chris@42 296 E T3t, T3j, T3x, T3p;
Chris@42 297 T3t = FMA(KP707106781, T3i, T3h);
Chris@42 298 T3j = FNMS(KP707106781, T3i, T3h);
Chris@42 299 T3x = FMA(KP707106781, T3o, T3n);
Chris@42 300 T3p = FNMS(KP707106781, T3o, T3n);
Chris@42 301 Rm[WS(rs, 2)] = FMA(TE, T3V, T3Y);
Chris@42 302 {
Chris@42 303 E T3u, T3k, T3y, T3q;
Chris@42 304 T3u = T3s * T3t;
Chris@42 305 T3k = T3g * T3j;
Chris@42 306 T3y = T3s * T3x;
Chris@42 307 T3q = T3g * T3p;
Chris@42 308 Rp[WS(rs, 3)] = FNMS(T3w, T3x, T3u);
Chris@42 309 Rp[WS(rs, 7)] = FNMS(T3m, T3p, T3k);
Chris@42 310 Rm[WS(rs, 3)] = FMA(T3w, T3t, T3y);
Chris@42 311 Rm[WS(rs, 7)] = FMA(T3m, T3j, T3q);
Chris@42 312 T3b = FMA(KP707106781, T2W, T2L);
Chris@42 313 T2X = FNMS(KP707106781, T2W, T2L);
Chris@42 314 }
Chris@42 315 }
Chris@42 316 T2Y = T2G * T2X;
Chris@42 317 T3a = T30 * T2X;
Chris@42 318 T3c = T1V * T3b;
Chris@42 319 T3d = FMA(KP707106781, T38, T35);
Chris@42 320 T39 = FNMS(KP707106781, T38, T35);
Chris@42 321 T3e = T1X * T3b;
Chris@42 322 T2q = FMA(KP707106781, T20, T1Z);
Chris@42 323 T21 = FNMS(KP707106781, T20, T1Z);
Chris@42 324 Rp[WS(rs, 1)] = FNMS(T1X, T3d, T3c);
Chris@42 325 Rm[WS(rs, 5)] = FMA(T2G, T39, T3a);
Chris@42 326 Rp[WS(rs, 5)] = FNMS(T30, T39, T2Y);
Chris@42 327 Rm[WS(rs, 1)] = FMA(T1V, T3d, T3e);
Chris@42 328 T28 = T24 + T27;
Chris@42 329 T2w = T27 - T24;
Chris@42 330 T2v = FNMS(KP707106781, T2e, T2d);
Chris@42 331 T2f = FMA(KP707106781, T2e, T2d);
Chris@42 332 T2i = T2g - T2h;
Chris@42 333 T2r = T2g + T2h;
Chris@42 334 }
Chris@42 335 {
Chris@42 336 E TX, T1k, T1x, T1A;
Chris@42 337 T1J = FMA(KP707106781, TW, TL);
Chris@42 338 TX = FNMS(KP707106781, TW, TL);
Chris@42 339 {
Chris@42 340 E T2l, T29, T2n, T2j;
Chris@42 341 T2l = FNMS(KP923879532, T28, T21);
Chris@42 342 T29 = FMA(KP923879532, T28, T21);
Chris@42 343 T2n = FMA(KP923879532, T2i, T2f);
Chris@42 344 T2j = FNMS(KP923879532, T2i, T2f);
Chris@42 345 {
Chris@42 346 E T2o, T2m, T2k, T2a;
Chris@42 347 T2o = Tz * T2l;
Chris@42 348 T2m = Tw * T2l;
Chris@42 349 T2k = T2c * T29;
Chris@42 350 T2a = T1Y * T29;
Chris@42 351 Im[WS(rs, 1)] = FMA(Tw, T2n, T2o);
Chris@42 352 Ip[WS(rs, 1)] = FNMS(Tz, T2n, T2m);
Chris@42 353 Im[WS(rs, 5)] = FMA(T1Y, T2j, T2k);
Chris@42 354 Ip[WS(rs, 5)] = FNMS(T2c, T2j, T2a);
Chris@42 355 T1k = T18 - T1j;
Chris@42 356 T1O = T1j + T18;
Chris@42 357 }
Chris@42 358 }
Chris@42 359 T1N = FMA(KP707106781, T1w, T1t);
Chris@42 360 T1x = FNMS(KP707106781, T1w, T1t);
Chris@42 361 T1A = T1y - T1z;
Chris@42 362 T1K = T1y + T1z;
Chris@42 363 {
Chris@42 364 E T1E, T1l, T1H, T1B;
Chris@42 365 T1E = FMA(KP923879532, T1k, TX);
Chris@42 366 T1l = FNMS(KP923879532, T1k, TX);
Chris@42 367 T1H = FMA(KP923879532, T1A, T1x);
Chris@42 368 T1B = FNMS(KP923879532, T1A, T1x);
Chris@42 369 {
Chris@42 370 E T1I, T1F, T1C, T1m;
Chris@42 371 T1I = T1G * T1E;
Chris@42 372 T1F = T1D * T1E;
Chris@42 373 T1C = T1o * T1l;
Chris@42 374 T1m = TG * T1l;
Chris@42 375 Im[WS(rs, 2)] = FMA(T1D, T1H, T1I);
Chris@42 376 Ip[WS(rs, 2)] = FNMS(T1G, T1H, T1F);
Chris@42 377 Im[WS(rs, 6)] = FMA(TG, T1B, T1C);
Chris@42 378 Ip[WS(rs, 6)] = FNMS(T1o, T1B, T1m);
Chris@42 379 }
Chris@42 380 }
Chris@42 381 {
Chris@42 382 E T2A, T2s, T2D, T2x;
Chris@42 383 T2A = FMA(KP923879532, T2r, T2q);
Chris@42 384 T2s = FNMS(KP923879532, T2r, T2q);
Chris@42 385 T2D = FNMS(KP923879532, T2w, T2v);
Chris@42 386 T2x = FMA(KP923879532, T2w, T2v);
Chris@42 387 {
Chris@42 388 E T2B, T2t, T2E, T2y;
Chris@42 389 T2B = T2z * T2A;
Chris@42 390 T2t = T2p * T2s;
Chris@42 391 T2E = T2z * T2D;
Chris@42 392 T2y = T2p * T2x;
Chris@42 393 Ip[WS(rs, 7)] = FNMS(T2C, T2D, T2B);
Chris@42 394 Ip[WS(rs, 3)] = FNMS(T2u, T2x, T2t);
Chris@42 395 Im[WS(rs, 7)] = FMA(T2C, T2A, T2E);
Chris@42 396 Im[WS(rs, 3)] = FMA(T2u, T2s, T2y);
Chris@42 397 }
Chris@42 398 }
Chris@42 399 }
Chris@42 400 }
Chris@42 401 }
Chris@42 402 }
Chris@42 403 }
Chris@42 404 {
Chris@42 405 E T1L, T1R, T1P, T1T;
Chris@42 406 T1L = FNMS(KP923879532, T1K, T1J);
Chris@42 407 T1R = FMA(KP923879532, T1K, T1J);
Chris@42 408 T1P = FNMS(KP923879532, T1O, T1N);
Chris@42 409 T1T = FMA(KP923879532, T1O, T1N);
Chris@42 410 {
Chris@42 411 E T1S, T1M, T1U, T1Q;
Chris@42 412 T1S = Tv * T1R;
Chris@42 413 T1M = TB * T1L;
Chris@42 414 T1U = Tv * T1T;
Chris@42 415 T1Q = TB * T1P;
Chris@42 416 Ip[0] = FNMS(Ty, T1T, T1S);
Chris@42 417 Ip[WS(rs, 4)] = FNMS(TF, T1P, T1M);
Chris@42 418 Im[0] = FMA(Ty, T1R, T1U);
Chris@42 419 Im[WS(rs, 4)] = FMA(TF, T1L, T1Q);
Chris@42 420 }
Chris@42 421 }
Chris@42 422 }
Chris@42 423 }
Chris@42 424 }
Chris@42 425
/*
 * Twiddle instruction table referenced by desc: four TW_CEXP entries whose
 * last field is 1, 3, 9 and 15, closed by a TW_NEXT entry.
 * NOTE(review): hc2cb2_16 consumes eight W values per iteration, so each
 * TW_CEXP entry presumably supplies one pair of them -- confirm against the
 * tw_instr definition.
 */
Chris@42 426 static const tw_instr twinstr[] = {
Chris@42 427 {TW_CEXP, 1, 1},
Chris@42 428 {TW_CEXP, 1, 3},
Chris@42 429 {TW_CEXP, 1, 9},
Chris@42 430 {TW_CEXP, 1, 15},
Chris@42 431 {TW_NEXT, 1, 0}
Chris@42 432 };
Chris@42 433
/*
 * Codelet descriptor passed at registration: 16, the codelet name, the
 * twiddle table, &GENUS, and the counts {104, 42, 92, 0}. The first three
 * counts match the additions / multiplications / fused multiply-adds quoted
 * in the generator comment for this HAVE_FMA variant.
 */
Chris@42 434 static const hc2c_desc desc = { 16, "hc2cb2_16", twinstr, &GENUS, {104, 42, 92, 0} };
Chris@42 435
/*
 * Registration entry point (HAVE_FMA build): hands hc2cb2_16 and its
 * descriptor to X(khc2c_register) on planner p, tagged HC2C_VIA_RDFT.
 */
Chris@42 436 void X(codelet_hc2cb2_16) (planner *p) {
Chris@42 437 X(khc2c_register) (p, hc2cb2_16, &desc, HC2C_VIA_RDFT);
Chris@42 438 }
Chris@42 439 #else /* HAVE_FMA */
Chris@42 440
Chris@42 441 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hc2cb2_16 -include hc2cb.h */
Chris@42 442
Chris@42 443 /*
Chris@42 444 * This function contains 196 FP additions, 108 FP multiplications,
Chris@42 445 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
Chris@42 446 * 80 stack variables, 3 constants, and 64 memory accesses
Chris@42 447 */
Chris@42 448 #include "hc2cb.h"
Chris@42 449
/*
 * hc2cb2_16, variant compiled when HAVE_FMA is not defined.
 *
 * For each m in [mb, me) the loop body:
 *   - reads the eight twiddle values W[0..7] and forms further factors as
 *     sums/differences of their pairwise products;
 *   - loads eight values from each of Rp, Ip, Rm and Im (index k scaled by
 *     the stride rs through WS(rs, k), k = 0..7);
 *   - stores 32 results back to those same locations. All loads of an
 *     iteration are issued before its first store.
 * Rp[0] and Rm[0] are stored without a twiddle multiply. Ip/Im at indices
 * 0, 1, 4 and 7 are multiplied by the stored pairs (W[0],W[1]), (W[2],W[3]),
 * (W[4],W[5]) and (W[6],W[7]) respectively; every other output uses a
 * derived factor.
 * Between iterations Rp and Ip advance by +ms, Rm and Im by -ms, W by 8.
 *
 * This is generator output (see the "Generated by" line just before this
 * variant: -n 16 -dif -sign 1 -twiddle-log3); do not hand-edit the
 * arithmetic.
 */
Chris@42 450 static void hc2cb2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 451 {
/* Numerically: sin(pi/8), cos(pi/8), and sqrt(1/2). */
Chris@42 452 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 453 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 454 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 455 {
Chris@42 456 INT m;
/* W is pre-offset by (mb - 1) * 8: eight twiddle values are consumed per m. */
Chris@42 457 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@42 458 E Tv, Ty, T1l, T1n, T1p, T1t, T27, T25, Tz, Tw, TB, T21, T1P, T1H, T1X;
Chris@42 459 E T17, T1L, T1N, T1v, T1w, T1x, T1B, T2F, T2T, T2b, T2R, T3j, T3x, T35, T3t;
/* Twiddle setup: Tv=W[0], Ty=W[1], T1l=W[2], T1n=W[3], Tw=W[4], Tz=W[5], T1v=W[6], T1w=W[7], plus derived factors. */
Chris@42 460 {
Chris@42 461 E TA, T1J, T15, T1G, Tx, T1K, T16, T1F;
Chris@42 462 {
Chris@42 463 E T1m, T1s, T1o, T1r;
Chris@42 464 Tv = W[0];
Chris@42 465 Ty = W[1];
Chris@42 466 T1l = W[2];
Chris@42 467 T1n = W[3];
Chris@42 468 T1m = Tv * T1l;
Chris@42 469 T1s = Ty * T1l;
Chris@42 470 T1o = Ty * T1n;
Chris@42 471 T1r = Tv * T1n;
Chris@42 472 T1p = T1m + T1o;
Chris@42 473 T1t = T1r - T1s;
Chris@42 474 T27 = T1r + T1s;
Chris@42 475 T25 = T1m - T1o;
Chris@42 476 Tz = W[5];
Chris@42 477 TA = Ty * Tz;
Chris@42 478 T1J = T1l * Tz;
Chris@42 479 T15 = Tv * Tz;
Chris@42 480 T1G = T1n * Tz;
Chris@42 481 Tw = W[4];
Chris@42 482 Tx = Tv * Tw;
Chris@42 483 T1K = T1n * Tw;
Chris@42 484 T16 = Ty * Tw;
Chris@42 485 T1F = T1l * Tw;
Chris@42 486 }
Chris@42 487 TB = Tx - TA;
Chris@42 488 T21 = T1J + T1K;
Chris@42 489 T1P = T15 - T16;
Chris@42 490 T1H = T1F + T1G;
Chris@42 491 T1X = T1F - T1G;
Chris@42 492 T17 = T15 + T16;
Chris@42 493 T1L = T1J - T1K;
Chris@42 494 T1N = Tx + TA;
Chris@42 495 T1v = W[6];
Chris@42 496 T1w = W[7];
Chris@42 497 T1x = FMA(Tv, T1v, Ty * T1w);
Chris@42 498 T1B = FNMS(Ty, T1v, Tv * T1w);
Chris@42 499 {
Chris@42 500 E T2D, T2E, T29, T2a;
Chris@42 501 T2D = T25 * Tz;
Chris@42 502 T2E = T27 * Tw;
Chris@42 503 T2F = T2D + T2E;
Chris@42 504 T2T = T2D - T2E;
Chris@42 505 T29 = T25 * Tw;
Chris@42 506 T2a = T27 * Tz;
Chris@42 507 T2b = T29 - T2a;
Chris@42 508 T2R = T29 + T2a;
Chris@42 509 }
Chris@42 510 {
Chris@42 511 E T3h, T3i, T33, T34;
Chris@42 512 T3h = T1p * Tz;
Chris@42 513 T3i = T1t * Tw;
Chris@42 514 T3j = T3h + T3i;
Chris@42 515 T3x = T3h - T3i;
Chris@42 516 T33 = T1p * Tw;
Chris@42 517 T34 = T1t * Tz;
Chris@42 518 T35 = T33 - T34;
Chris@42 519 T3t = T33 + T34;
Chris@42 520 }
Chris@42 521 }
Chris@42 522 {
Chris@42 523 E T7, T36, T3k, TC, T1f, T2e, T2I, T1Q, Te, TJ, T1R, T18, T2L, T37, T2l;
Chris@42 524 E T3l, Tm, T1T, TT, T1h, T2A, T2N, T3b, T3n, Tt, T1U, T12, T1i, T2t, T2O;
Chris@42 525 E T3e, T3o;
/* Input loads and first sum/difference stage, in four groups of eight loads each. */
Chris@42 526 {
Chris@42 527 E T3, T2c, T1b, T2H, T6, T2G, T1e, T2d;
Chris@42 528 {
Chris@42 529 E T1, T2, T19, T1a;
Chris@42 530 T1 = Rp[0];
Chris@42 531 T2 = Rm[WS(rs, 7)];
Chris@42 532 T3 = T1 + T2;
Chris@42 533 T2c = T1 - T2;
Chris@42 534 T19 = Ip[0];
Chris@42 535 T1a = Im[WS(rs, 7)];
Chris@42 536 T1b = T19 - T1a;
Chris@42 537 T2H = T19 + T1a;
Chris@42 538 }
Chris@42 539 {
Chris@42 540 E T4, T5, T1c, T1d;
Chris@42 541 T4 = Rp[WS(rs, 4)];
Chris@42 542 T5 = Rm[WS(rs, 3)];
Chris@42 543 T6 = T4 + T5;
Chris@42 544 T2G = T4 - T5;
Chris@42 545 T1c = Ip[WS(rs, 4)];
Chris@42 546 T1d = Im[WS(rs, 3)];
Chris@42 547 T1e = T1c - T1d;
Chris@42 548 T2d = T1c + T1d;
Chris@42 549 }
Chris@42 550 T7 = T3 + T6;
Chris@42 551 T36 = T2c + T2d;
Chris@42 552 T3k = T2H - T2G;
Chris@42 553 TC = T3 - T6;
Chris@42 554 T1f = T1b - T1e;
Chris@42 555 T2e = T2c - T2d;
Chris@42 556 T2I = T2G + T2H;
Chris@42 557 T1Q = T1b + T1e;
Chris@42 558 }
Chris@42 559 {
Chris@42 560 E Ta, T2f, TI, T2g, Td, T2i, TF, T2j;
Chris@42 561 {
Chris@42 562 E T8, T9, TG, TH;
Chris@42 563 T8 = Rp[WS(rs, 2)];
Chris@42 564 T9 = Rm[WS(rs, 5)];
Chris@42 565 Ta = T8 + T9;
Chris@42 566 T2f = T8 - T9;
Chris@42 567 TG = Ip[WS(rs, 2)];
Chris@42 568 TH = Im[WS(rs, 5)];
Chris@42 569 TI = TG - TH;
Chris@42 570 T2g = TG + TH;
Chris@42 571 }
Chris@42 572 {
Chris@42 573 E Tb, Tc, TD, TE;
Chris@42 574 Tb = Rm[WS(rs, 1)];
Chris@42 575 Tc = Rp[WS(rs, 6)];
Chris@42 576 Td = Tb + Tc;
Chris@42 577 T2i = Tb - Tc;
Chris@42 578 TD = Ip[WS(rs, 6)];
Chris@42 579 TE = Im[WS(rs, 1)];
Chris@42 580 TF = TD - TE;
Chris@42 581 T2j = TD + TE;
Chris@42 582 }
Chris@42 583 Te = Ta + Td;
Chris@42 584 TJ = TF - TI;
Chris@42 585 T1R = TI + TF;
Chris@42 586 T18 = Ta - Td;
Chris@42 587 {
Chris@42 588 E T2J, T2K, T2h, T2k;
Chris@42 589 T2J = T2f + T2g;
Chris@42 590 T2K = T2i + T2j;
Chris@42 591 T2L = KP707106781 * (T2J - T2K);
Chris@42 592 T37 = KP707106781 * (T2J + T2K);
Chris@42 593 T2h = T2f - T2g;
Chris@42 594 T2k = T2i - T2j;
Chris@42 595 T2l = KP707106781 * (T2h + T2k);
Chris@42 596 T3l = KP707106781 * (T2h - T2k);
Chris@42 597 }
Chris@42 598 }
Chris@42 599 {
Chris@42 600 E Ti, T2x, TO, T2v, Tl, T2u, TR, T2y, TL, TS;
Chris@42 601 {
Chris@42 602 E Tg, Th, TM, TN;
Chris@42 603 Tg = Rp[WS(rs, 1)];
Chris@42 604 Th = Rm[WS(rs, 6)];
Chris@42 605 Ti = Tg + Th;
Chris@42 606 T2x = Tg - Th;
Chris@42 607 TM = Ip[WS(rs, 1)];
Chris@42 608 TN = Im[WS(rs, 6)];
Chris@42 609 TO = TM - TN;
Chris@42 610 T2v = TM + TN;
Chris@42 611 }
Chris@42 612 {
Chris@42 613 E Tj, Tk, TP, TQ;
Chris@42 614 Tj = Rp[WS(rs, 5)];
Chris@42 615 Tk = Rm[WS(rs, 2)];
Chris@42 616 Tl = Tj + Tk;
Chris@42 617 T2u = Tj - Tk;
Chris@42 618 TP = Ip[WS(rs, 5)];
Chris@42 619 TQ = Im[WS(rs, 2)];
Chris@42 620 TR = TP - TQ;
Chris@42 621 T2y = TP + TQ;
Chris@42 622 }
Chris@42 623 Tm = Ti + Tl;
Chris@42 624 T1T = TO + TR;
Chris@42 625 TL = Ti - Tl;
Chris@42 626 TS = TO - TR;
Chris@42 627 TT = TL - TS;
Chris@42 628 T1h = TL + TS;
Chris@42 629 {
Chris@42 630 E T2w, T2z, T39, T3a;
Chris@42 631 T2w = T2u + T2v;
Chris@42 632 T2z = T2x - T2y;
Chris@42 633 T2A = FMA(KP923879532, T2w, KP382683432 * T2z);
Chris@42 634 T2N = FNMS(KP382683432, T2w, KP923879532 * T2z);
Chris@42 635 T39 = T2x + T2y;
Chris@42 636 T3a = T2v - T2u;
Chris@42 637 T3b = FNMS(KP923879532, T3a, KP382683432 * T39);
Chris@42 638 T3n = FMA(KP382683432, T3a, KP923879532 * T39);
Chris@42 639 }
Chris@42 640 }
Chris@42 641 {
Chris@42 642 E Tp, T2q, TX, T2o, Ts, T2n, T10, T2r, TU, T11;
Chris@42 643 {
Chris@42 644 E Tn, To, TV, TW;
Chris@42 645 Tn = Rm[0];
Chris@42 646 To = Rp[WS(rs, 7)];
Chris@42 647 Tp = Tn + To;
Chris@42 648 T2q = Tn - To;
Chris@42 649 TV = Ip[WS(rs, 7)];
Chris@42 650 TW = Im[0];
Chris@42 651 TX = TV - TW;
Chris@42 652 T2o = TV + TW;
Chris@42 653 }
Chris@42 654 {
Chris@42 655 E Tq, Tr, TY, TZ;
Chris@42 656 Tq = Rp[WS(rs, 3)];
Chris@42 657 Tr = Rm[WS(rs, 4)];
Chris@42 658 Ts = Tq + Tr;
Chris@42 659 T2n = Tq - Tr;
Chris@42 660 TY = Ip[WS(rs, 3)];
Chris@42 661 TZ = Im[WS(rs, 4)];
Chris@42 662 T10 = TY - TZ;
Chris@42 663 T2r = TY + TZ;
Chris@42 664 }
Chris@42 665 Tt = Tp + Ts;
Chris@42 666 T1U = TX + T10;
Chris@42 667 TU = Tp - Ts;
Chris@42 668 T11 = TX - T10;
Chris@42 669 T12 = TU + T11;
Chris@42 670 T1i = T11 - TU;
Chris@42 671 {
Chris@42 672 E T2p, T2s, T3c, T3d;
Chris@42 673 T2p = T2n - T2o;
Chris@42 674 T2s = T2q - T2r;
Chris@42 675 T2t = FNMS(KP382683432, T2s, KP923879532 * T2p);
Chris@42 676 T2O = FMA(KP382683432, T2p, KP923879532 * T2s);
Chris@42 677 T3c = T2q + T2r;
Chris@42 678 T3d = T2n + T2o;
Chris@42 679 T3e = FNMS(KP923879532, T3d, KP382683432 * T3c);
Chris@42 680 T3o = FMA(KP382683432, T3d, KP923879532 * T3c);
Chris@42 681 }
Chris@42 682 }
/* From here on each sub-block combines intermediates and stores four outputs; no further input loads occur. */
Chris@42 683 {
Chris@42 684 E Tf, Tu, T1O, T1S, T1V, T1W;
Chris@42 685 Tf = T7 + Te;
Chris@42 686 Tu = Tm + Tt;
Chris@42 687 T1O = Tf - Tu;
Chris@42 688 T1S = T1Q + T1R;
Chris@42 689 T1V = T1T + T1U;
Chris@42 690 T1W = T1S - T1V;
Chris@42 691 Rp[0] = Tf + Tu;
Chris@42 692 Rm[0] = T1S + T1V;
Chris@42 693 Rp[WS(rs, 4)] = FNMS(T1P, T1W, T1N * T1O);
Chris@42 694 Rm[WS(rs, 4)] = FMA(T1P, T1O, T1N * T1W);
Chris@42 695 }
Chris@42 696 {
Chris@42 697 E T3g, T3r, T3q, T3s;
Chris@42 698 {
Chris@42 699 E T38, T3f, T3m, T3p;
Chris@42 700 T38 = T36 - T37;
Chris@42 701 T3f = T3b + T3e;
Chris@42 702 T3g = T38 - T3f;
Chris@42 703 T3r = T38 + T3f;
Chris@42 704 T3m = T3k + T3l;
Chris@42 705 T3p = T3n - T3o;
Chris@42 706 T3q = T3m - T3p;
Chris@42 707 T3s = T3m + T3p;
Chris@42 708 }
Chris@42 709 Ip[WS(rs, 5)] = FNMS(T3j, T3q, T35 * T3g);
Chris@42 710 Im[WS(rs, 5)] = FMA(T3j, T3g, T35 * T3q);
Chris@42 711 Ip[WS(rs, 1)] = FNMS(T1n, T3s, T1l * T3r);
Chris@42 712 Im[WS(rs, 1)] = FMA(T1n, T3r, T1l * T3s);
Chris@42 713 }
Chris@42 714 {
Chris@42 715 E T3w, T3B, T3A, T3C;
Chris@42 716 {
Chris@42 717 E T3u, T3v, T3y, T3z;
Chris@42 718 T3u = T36 + T37;
Chris@42 719 T3v = T3n + T3o;
Chris@42 720 T3w = T3u - T3v;
Chris@42 721 T3B = T3u + T3v;
Chris@42 722 T3y = T3k - T3l;
Chris@42 723 T3z = T3b - T3e;
Chris@42 724 T3A = T3y + T3z;
Chris@42 725 T3C = T3y - T3z;
Chris@42 726 }
Chris@42 727 Ip[WS(rs, 3)] = FNMS(T3x, T3A, T3t * T3w);
Chris@42 728 Im[WS(rs, 3)] = FMA(T3t, T3A, T3x * T3w);
Chris@42 729 Ip[WS(rs, 7)] = FNMS(T1w, T3C, T1v * T3B);
Chris@42 730 Im[WS(rs, 7)] = FMA(T1v, T3C, T1w * T3B);
Chris@42 731 }
Chris@42 732 {
Chris@42 733 E T14, T1q, T1k, T1u;
Chris@42 734 {
Chris@42 735 E TK, T13, T1g, T1j;
Chris@42 736 TK = TC + TJ;
Chris@42 737 T13 = KP707106781 * (TT + T12);
Chris@42 738 T14 = TK - T13;
Chris@42 739 T1q = TK + T13;
Chris@42 740 T1g = T18 + T1f;
Chris@42 741 T1j = KP707106781 * (T1h + T1i);
Chris@42 742 T1k = T1g - T1j;
Chris@42 743 T1u = T1g + T1j;
Chris@42 744 }
Chris@42 745 Rp[WS(rs, 5)] = FNMS(T17, T1k, TB * T14);
Chris@42 746 Rm[WS(rs, 5)] = FMA(T17, T14, TB * T1k);
Chris@42 747 Rp[WS(rs, 1)] = FNMS(T1t, T1u, T1p * T1q);
Chris@42 748 Rm[WS(rs, 1)] = FMA(T1t, T1q, T1p * T1u);
Chris@42 749 }
Chris@42 750 {
Chris@42 751 E T1A, T1I, T1E, T1M;
Chris@42 752 {
Chris@42 753 E T1y, T1z, T1C, T1D;
Chris@42 754 T1y = TC - TJ;
Chris@42 755 T1z = KP707106781 * (T1i - T1h);
Chris@42 756 T1A = T1y - T1z;
Chris@42 757 T1I = T1y + T1z;
Chris@42 758 T1C = T1f - T18;
Chris@42 759 T1D = KP707106781 * (TT - T12);
Chris@42 760 T1E = T1C - T1D;
Chris@42 761 T1M = T1C + T1D;
Chris@42 762 }
Chris@42 763 Rp[WS(rs, 7)] = FNMS(T1B, T1E, T1x * T1A);
Chris@42 764 Rm[WS(rs, 7)] = FMA(T1x, T1E, T1B * T1A);
Chris@42 765 Rp[WS(rs, 3)] = FNMS(T1L, T1M, T1H * T1I);
Chris@42 766 Rm[WS(rs, 3)] = FMA(T1H, T1M, T1L * T1I);
Chris@42 767 }
Chris@42 768 {
Chris@42 769 E T2C, T2S, T2Q, T2U;
Chris@42 770 {
Chris@42 771 E T2m, T2B, T2M, T2P;
Chris@42 772 T2m = T2e - T2l;
Chris@42 773 T2B = T2t - T2A;
Chris@42 774 T2C = T2m - T2B;
Chris@42 775 T2S = T2m + T2B;
Chris@42 776 T2M = T2I - T2L;
Chris@42 777 T2P = T2N - T2O;
Chris@42 778 T2Q = T2M - T2P;
Chris@42 779 T2U = T2M + T2P;
Chris@42 780 }
Chris@42 781 Ip[WS(rs, 6)] = FNMS(T2F, T2Q, T2b * T2C);
Chris@42 782 Im[WS(rs, 6)] = FMA(T2F, T2C, T2b * T2Q);
Chris@42 783 Ip[WS(rs, 2)] = FNMS(T2T, T2U, T2R * T2S);
Chris@42 784 Im[WS(rs, 2)] = FMA(T2T, T2S, T2R * T2U);
Chris@42 785 }
Chris@42 786 {
Chris@42 787 E T2X, T31, T30, T32;
Chris@42 788 {
Chris@42 789 E T2V, T2W, T2Y, T2Z;
Chris@42 790 T2V = T2e + T2l;
Chris@42 791 T2W = T2N + T2O;
Chris@42 792 T2X = T2V - T2W;
Chris@42 793 T31 = T2V + T2W;
Chris@42 794 T2Y = T2I + T2L;
Chris@42 795 T2Z = T2A + T2t;
Chris@42 796 T30 = T2Y - T2Z;
Chris@42 797 T32 = T2Y + T2Z;
Chris@42 798 }
Chris@42 799 Ip[WS(rs, 4)] = FNMS(Tz, T30, Tw * T2X);
Chris@42 800 Im[WS(rs, 4)] = FMA(Tw, T30, Tz * T2X);
Chris@42 801 Ip[0] = FNMS(Ty, T32, Tv * T31);
Chris@42 802 Im[0] = FMA(Tv, T32, Ty * T31);
Chris@42 803 }
Chris@42 804 {
Chris@42 805 E T20, T26, T24, T28;
Chris@42 806 {
Chris@42 807 E T1Y, T1Z, T22, T23;
Chris@42 808 T1Y = T7 - Te;
Chris@42 809 T1Z = T1U - T1T;
Chris@42 810 T20 = T1Y - T1Z;
Chris@42 811 T26 = T1Y + T1Z;
Chris@42 812 T22 = T1Q - T1R;
Chris@42 813 T23 = Tm - Tt;
Chris@42 814 T24 = T22 - T23;
Chris@42 815 T28 = T23 + T22;
Chris@42 816 }
Chris@42 817 Rp[WS(rs, 6)] = FNMS(T21, T24, T1X * T20);
Chris@42 818 Rm[WS(rs, 6)] = FMA(T1X, T24, T21 * T20);
Chris@42 819 Rp[WS(rs, 2)] = FNMS(T27, T28, T25 * T26);
Chris@42 820 Rm[WS(rs, 2)] = FMA(T25, T28, T27 * T26);
Chris@42 821 }
Chris@42 822 }
Chris@42 823 }
Chris@42 824 }
Chris@42 825 }
Chris@42 826
/*
 * Twiddle instruction table referenced by desc: four TW_CEXP entries whose
 * last field is 1, 3, 9 and 15, closed by a TW_NEXT entry. Identical to the
 * table in the HAVE_FMA branch.
 * NOTE(review): hc2cb2_16 consumes eight W values per iteration, so each
 * TW_CEXP entry presumably supplies one pair of them -- confirm against the
 * tw_instr definition.
 */
Chris@42 827 static const tw_instr twinstr[] = {
Chris@42 828 {TW_CEXP, 1, 1},
Chris@42 829 {TW_CEXP, 1, 3},
Chris@42 830 {TW_CEXP, 1, 9},
Chris@42 831 {TW_CEXP, 1, 15},
Chris@42 832 {TW_NEXT, 1, 0}
Chris@42 833 };
Chris@42 834
/*
 * Codelet descriptor passed at registration: 16, the codelet name, the
 * twiddle table, &GENUS, and the counts {156, 68, 40, 0}. The first three
 * counts match the additions / multiplications / fused multiply-adds quoted
 * in the generator comment for this non-FMA variant.
 */
Chris@42 835 static const hc2c_desc desc = { 16, "hc2cb2_16", twinstr, &GENUS, {156, 68, 40, 0} };
Chris@42 836
/*
 * Registration entry point (build without HAVE_FMA): hands hc2cb2_16 and its
 * descriptor to X(khc2c_register) on planner p, tagged HC2C_VIA_RDFT.
 */
Chris@42 837 void X(codelet_hc2cb2_16) (planner *p) {
Chris@42 838 X(khc2c_register) (p, hc2cb2_16, &desc, HC2C_VIA_RDFT);
Chris@42 839 }
Chris@42 840 #endif /* HAVE_FMA */